diff --git a/0001-modify-operator-throughput-speed.patch b/0001-modify-operator-throughput-speed.patch
new file mode 100644
index 0000000000000000000000000000000000000000..e7d04115e0d27e293295d3cf2ff096fa8617831d
--- /dev/null
+++ b/0001-modify-operator-throughput-speed.patch
@@ -0,0 +1,352174 @@
+From 7f26a931153a32c5099746b6c18bbb88054ce70d Mon Sep 17 00:00:00 2001
+From: 孙海亮
+Date: Mon, 17 Feb 2025 11:30:08 +0800
+Subject: [PATCH] Initialize v0.6.2
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+---
+ .buildkite/check-wheel-size.py | 35 +-
+ .buildkite/generate_index.py | 24 +
+ .../configs/DeepSeek-V2-Lite-Chat.yaml | 12 +
+ ...lama-3-70B-Instruct-FBGEMM-nonuniform.yaml | 11 +
+ .../configs/Meta-Llama-3-70B-Instruct.yaml | 11 +
+ ...struct-Channelwise-compressed-tensors.yaml | 11 +
+ ...Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml | 11 +
+ ...-3-8B-Instruct-FP8-compressed-tensors.yaml | 11 +
+ .../configs/Meta-Llama-3-8B-Instruct-FP8.yaml | 11 +
+ ...Instruct-INT8-compressed-tensors-asym.yaml | 11 +
+ ...3-8B-Instruct-INT8-compressed-tensors.yaml | 11 +
+ ...nstruct-nonuniform-compressed-tensors.yaml | 11 +
+ .../configs/Meta-Llama-3-8B-Instruct.yaml | 11 +
+ .../configs/Meta-Llama-3-8B-QQQ.yaml | 11 +
+ ...2-1B-Instruct-INT8-compressed-tensors.yaml | 11 +
+ .../configs/Minitron-4B-Base-FP8.yaml | 11 +
+ ...xtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml | 11 +
+ .../Mixtral-8x7B-Instruct-v0.1-FP8.yaml | 11 +
+ .../configs/Mixtral-8x7B-Instruct-v0.1.yaml | 11 +
+ .../configs/Qwen2-1.5B-Instruct-FP8W8.yaml | 11 +
+ ...1.5B-Instruct-INT8-compressed-tensors.yaml | 11 +
+ ....5B-Instruct-W8A16-compressed-tensors.yaml | 11 +
+ .../configs/Qwen2-57B-A14-Instruct.yaml | 11 +
+ .../lm-eval-harness/configs/models-large.txt | 5 +
+ .../lm-eval-harness/configs/models-small.txt | 10 +
+ .../run-lm-eval-gsm-hf-baseline.sh | 46 +
+ .../run-lm-eval-gsm-vllm-baseline.sh | 51 +
+ .buildkite/lm-eval-harness/run-tests.sh | 59 +
+ .../test_lm_eval_correctness.py | 63 +
+ .buildkite/nightly-benchmarks/README.md | 153 +
+ .../benchmark-pipeline.yaml | 92 +
+ .../nightly-benchmarks/nightly-annotation.md | 28 +
+ .../nightly-descriptions.md | 39 +
+ .../nightly-benchmarks/nightly-pipeline.yaml | 196 +
+ .../performance-benchmarks-descriptions.md | 62 +
+ .../convert-results-json-to-markdown.py | 204 +
+ .../scripts/download-tokenizer.py | 26 +
+ .../scripts/generate-nightly-markdown.py | 95 +
+ .../scripts/get-lmdeploy-modelname.py | 6 +
+ .../scripts/launch-server.sh | 228 ++
+ .../scripts/nightly-annotate.sh | 78 +
+ .../scripts/run-nightly-benchmarks.sh | 355 ++
+ .../scripts/run-performance-benchmarks.sh | 377 ++
+ .../scripts/summary-nightly-results.py | 83 +
+ .../scripts/wait-for-image.sh | 19 +
+ .../tests/latency-tests.json | 32 +
+ .../tests/nightly-tests.json | 323 ++
+ .../tests/serving-tests.json | 80 +
+ .../tests/throughput-tests.json | 35 +
+ .buildkite/release-pipeline.yaml | 72 +
+ .buildkite/run-amd-test.sh | 144 +-
+ .buildkite/run-benchmarks.sh | 15 +-
+ .buildkite/run-cpu-test-ppc64le.sh | 14 +
+ .buildkite/run-cpu-test.sh | 82 +-
+ .buildkite/run-gh200-test.sh | 28 +
+ .buildkite/run-hpu-test.sh | 16 +
+ .buildkite/run-multi-node-test.sh | 108 +
+ .buildkite/run-neuron-test.sh | 59 +-
+ .buildkite/run-openvino-test.sh | 16 +
+ .buildkite/run-tpu-test.sh | 26 +
+ .buildkite/run-xpu-test.sh | 19 +
+ .buildkite/test-pipeline.yaml | 628 ++-
+ .buildkite/upload-wheels.sh | 71 +
.clang-format | 26 + + .dockerignore | 32 + + .github/CODEOWNERS | 33 + + .github/FUNDING.yml | 2 + + .github/ISSUE_TEMPLATE/100-documentation.yml | 7 + + .github/ISSUE_TEMPLATE/200-installation.yml | 7 + + .github/ISSUE_TEMPLATE/300-usage.yml | 7 + + .github/ISSUE_TEMPLATE/400-bug-report.yml | 107 + + .../ISSUE_TEMPLATE/500-feature-request.yml | 38 + + .github/ISSUE_TEMPLATE/600-new-model.yml | 40 + + .../700-performance-discussion.yml | 59 + + .github/ISSUE_TEMPLATE/750-RFC.yml | 7 + + .../ISSUE_TEMPLATE/800-misc-discussion.yml | 28 + + .github/PULL_REQUEST_TEMPLATE.md | 61 +- + .github/dependabot.yml | 31 + + .github/mergify.yml | 60 + + .github/scripts/cleanup_pr_body.sh | 50 + + .github/workflows/actionlint.yml | 40 + + .github/workflows/add_label_automerge.yml | 21 + + .github/workflows/clang-format.yml | 53 + + .github/workflows/cleanup_pr_body.yml | 26 + + .github/workflows/codespell.yml | 45 + + .github/workflows/doc-lint.yml | 32 + + .github/workflows/lint-and-deploy.yaml | 82 + + .github/workflows/matchers/actionlint.json | 17 + + .github/workflows/matchers/mypy.json | 16 + + .github/workflows/matchers/ruff.json | 17 + + .github/workflows/mypy.yaml | 43 +- + .github/workflows/png-lint.yml | 37 + + .github/workflows/publish.yml | 126 +- + .github/workflows/reminder_comment.yml | 21 + + .github/workflows/ruff.yml | 53 +- + .github/workflows/scripts/build.sh | 10 +- + .github/workflows/scripts/cuda-install.sh | 8 +- + .github/workflows/scripts/pytorch-install.sh | 2 +- + .github/workflows/shellcheck.yml | 37 + + .github/workflows/stale.yml | 52 + + .github/workflows/yapf.yml | 35 +- + .gitignore | 21 +- + .readthedocs.yaml | 12 +- + .shellcheckrc | 9 + + CMakeLists.txt | 539 ++- + CODE_OF_CONDUCT.md | 128 + + CONTRIBUTING.md | 55 +- + DCO | 34 + + Dockerfile | 239 +- + Dockerfile.arm | 62 + + Dockerfile.cpu | 67 +- + Dockerfile.hpu | 21 + + Dockerfile.neuron | 47 +- + Dockerfile.openvino | 29 + + Dockerfile.ppc64le | 38 + + Dockerfile.rocm | 189 +- + Dockerfile.tpu | 28 + + Dockerfile.xpu | 69 + + README.md | 149 +- + SECURITY.md | 11 + + benchmarks/README.md | 11 + + benchmarks/backend_request_func.py | 119 +- + benchmarks/benchmark_guided.py | 494 +++ + benchmarks/benchmark_latency.py | 141 +- + .../benchmark_long_document_qa_throughput.py | 183 + + benchmarks/benchmark_prefix_caching.py | 242 +- + benchmarks/benchmark_prioritization.py | 177 + + benchmarks/benchmark_serving.py | 852 +++- + benchmarks/benchmark_serving_guided.py | 881 +++++ + benchmarks/benchmark_throughput.py | 486 ++- + .../cutlass_benchmarks/sparse_benchmarks.py | 384 ++ + benchmarks/cutlass_benchmarks/utils.py | 96 + + .../cutlass_benchmarks/w8a8_benchmarks.py | 365 ++ + .../cutlass_benchmarks/weight_shapes.py | 43 + + .../disagg_overhead_benchmark.sh | 145 + + .../disagg_performance_benchmark.sh | 163 + + .../disagg_prefill_proxy_server.py | 61 + + .../disagg_benchmarks/round_robin_proxy.py | 60 + + .../visualize_benchmark_results.py | 46 + + .../fused_kernels/layernorm_rms_benchmarks.py | 173 + + benchmarks/kernels/benchmark_aqlm.py | 14 +- + benchmarks/kernels/benchmark_layernorm.py | 86 + + benchmarks/kernels/benchmark_machete.py | 672 ++++ + benchmarks/kernels/benchmark_marlin.py | 254 ++ + benchmarks/kernels/benchmark_moe.py | 367 ++ + .../kernels/benchmark_paged_attention.py | 45 +- + benchmarks/kernels/benchmark_quant.py | 100 + + benchmarks/kernels/benchmark_rmsnorm.py | 262 ++ + benchmarks/kernels/benchmark_rope.py | 22 +- + benchmarks/kernels/benchmark_shapes.py | 75 + + 
benchmarks/kernels/graph_machete_bench.py | 63 + + benchmarks/kernels/requirements.txt | 1 + + benchmarks/kernels/weight_shapes.py | 49 + + benchmarks/launch_tgi_server.sh | 10 +- + benchmarks/overheads/benchmark_hashing.py | 59 + + .../structured_schema_1.json | 113 + + cmake/cpu_extension.cmake | 164 +- + cmake/utils.cmake | 315 +- + collect_env.py | 64 +- + csrc/activation_kernels.cu | 195 +- + csrc/attention/attention_generic.cuh | 19 +- + csrc/attention/attention_kernels.cuh | 676 ++++ + csrc/attention/attention_utils.cuh | 13 +- + csrc/attention/dtype_bfloat16.cuh | 82 +- + csrc/attention/dtype_float16.cuh | 92 +- + csrc/attention/dtype_float32.cuh | 88 +- + csrc/attention/dtype_fp8.cuh | 36 +- + csrc/attention/paged_attention_v1.cu | 193 + + csrc/attention/paged_attention_v2.cu | 203 + + csrc/cache.h | 49 +- + csrc/cache_kernels.cu | 448 ++- + csrc/core/exception.hpp | 3 + + csrc/core/math.hpp | 7 + + csrc/core/registration.h | 27 + + csrc/core/scalar_type.hpp | 347 ++ + csrc/cpu/activation.cpp | 79 +- + csrc/cpu/attention.cpp | 476 ++- + csrc/cpu/cache.cpp | 77 +- + csrc/cpu/cpu_types.hpp | 357 +- + csrc/cpu/cpu_types_arm.hpp | 572 +++ + csrc/cpu/cpu_types_vsx.hpp | 491 +++ + csrc/cpu/cpu_types_x86.hpp | 632 +++ + csrc/cpu/dnnl_helper.hpp | 174 + + csrc/cpu/layernorm.cpp | 32 +- + csrc/cpu/pos_encoding.cpp | 166 +- + csrc/cpu/quant.cpp | 613 +++ + csrc/cpu/torch_bindings.cpp | 160 + + csrc/cpu/utils.cpp | 103 + + csrc/cuda_compat.h | 17 +- + csrc/cuda_utils.h | 17 +- + csrc/cuda_utils_kernels.cu | 40 +- + csrc/custom_all_reduce.cu | 150 +- + csrc/custom_all_reduce.cuh | 293 +- + csrc/custom_all_reduce_test.cu | 79 +- + csrc/cutlass_extensions/common.cpp | 11 + + csrc/cutlass_extensions/common.hpp | 35 + + csrc/cutlass_extensions/cute_utils.cuh | 68 + + .../epilogue/broadcast_load_epilogue_c2x.hpp | 497 +++ + .../epilogue/broadcast_load_epilogue_c3x.hpp | 447 +++ + .../epilogue/scaled_mm_epilogues_c2x.hpp | 319 ++ + .../epilogue/scaled_mm_epilogues_c3x.hpp | 317 ++ + csrc/cutlass_extensions/torch_utils.hpp | 160 + + .../vllm_collective_builder.cuh | 43 + + csrc/cutlass_extensions/vllm_custom_types.cuh | 50 + + .../vllm_cutlass_library_extension.py | 78 + + .../vllm_numeric_conversion.cuh | 992 +++++ + csrc/cutlass_extensions/vllm_type_utils.cuh | 42 + + csrc/dispatch_utils.h | 58 +- + csrc/layernorm_kernels.cu | 312 +- + csrc/layernorm_quant_kernels.cu | 234 ++ + csrc/mamba/causal_conv1d/causal_conv1d.cu | 662 ++++ + csrc/mamba/causal_conv1d/causal_conv1d.h | 159 + + csrc/mamba/causal_conv1d/static_switch.h | 28 + + csrc/mamba/mamba_ssm/selective_scan.h | 266 ++ + csrc/mamba/mamba_ssm/selective_scan_fwd.cu | 658 ++++ + csrc/mamba/mamba_ssm/static_switch.h | 28 + + csrc/moe/marlin_kernels/marlin_moe_kernel.h | 1616 ++++++++ + .../marlin_kernels/marlin_moe_kernel_ku4.cu | 31 + + .../marlin_kernels/marlin_moe_kernel_ku4.h | 20 + + .../marlin_kernels/marlin_moe_kernel_ku4b8.cu | 31 + + .../marlin_kernels/marlin_moe_kernel_ku4b8.h | 20 + + .../marlin_moe_kernel_ku8b128.cu | 31 + + .../marlin_moe_kernel_ku8b128.h | 18 + + csrc/moe/marlin_moe_ops.cu | 588 +++ + csrc/moe/moe_align_sum_kernels.cu | 324 ++ + csrc/moe/moe_ops.h | 17 +- + csrc/moe/topk_softmax_kernels.cu | 29 +- + csrc/moe/torch_bindings.cpp | 39 + + csrc/ops.h | 419 +- + csrc/permute_cols.cu | 88 + + csrc/pos_encoding_kernels.cu | 235 +- + csrc/prepare_inputs/advance_step.cu | 327 ++ + csrc/prepare_inputs/advance_step.cuh | 19 + + csrc/quantization/aqlm/gemm_kernels.cu | 553 ++- + csrc/quantization/awq/dequantize.cuh | 139 +- 
+ csrc/quantization/awq/gemm_kernels.cu | 620 +-- + .../compressed_tensors/int8_quant_kernels.cu | 286 ++ + csrc/quantization/cutlass_w8a8/Epilogues.md | 147 + + .../cutlass_w8a8/scaled_mm_c2x.cu | 199 + + .../cutlass_w8a8/scaled_mm_c2x.cuh | 220 ++ + .../scaled_mm_c2x_sm75_dispatch.cuh | 123 + + .../scaled_mm_c2x_sm80_dispatch.cuh | 139 + + .../scaled_mm_c2x_sm89_fp8_dispatch.cuh | 368 ++ + .../scaled_mm_c2x_sm89_int8_dispatch.cuh | 353 ++ + .../cutlass_w8a8/scaled_mm_c3x.cu | 87 + + .../cutlass_w8a8/scaled_mm_c3x.cuh | 160 + + .../scaled_mm_c3x_sm90_fp8_dispatch.cuh | 96 + + .../scaled_mm_c3x_sm90_int8_dispatch.cuh | 140 + + .../cutlass_w8a8/scaled_mm_entry.cu | 218 ++ + csrc/quantization/fp8/amd/hip_float8.h | 137 + + csrc/quantization/fp8/amd/hip_float8_impl.h | 316 ++ + csrc/quantization/fp8/amd/quant_utils.cuh | 577 +++ + csrc/quantization/fp8/common.cu | 149 + + csrc/quantization/fp8/common.cuh | 160 + + csrc/quantization/fp8/fp8_marlin.cu | 1311 +++++++ + csrc/quantization/fp8/nvidia/quant_utils.cuh | 573 +++ + ...fused_layernorm_dynamic_per_token_quant.cu | 160 + + .../fused_kernels/layernorm_utils.cuh | 327 ++ + .../fused_kernels/quant_conversions.cuh | 81 + + csrc/quantization/gguf/dequantize.cuh | 568 +++ + csrc/quantization/gguf/ggml-common.h | 1130 ++++++ + csrc/quantization/gguf/gguf_kernel.cu | 249 ++ + csrc/quantization/gguf/mmq.cuh | 600 +++ + csrc/quantization/gguf/mmvq.cuh | 190 + + csrc/quantization/gguf/vecdotq.cuh | 1810 +++++++++ + csrc/quantization/gptq/compat.cuh | 70 +- + csrc/quantization/gptq/matrix_view.cuh | 503 +-- + csrc/quantization/gptq/q_gemm.cu | 3443 ++++++++--------- + csrc/quantization/gptq/qdq_2.cuh | 107 +- + csrc/quantization/gptq/qdq_3.cuh | 246 +- + csrc/quantization/gptq/qdq_4.cuh | 203 +- + csrc/quantization/gptq/qdq_8.cuh | 34 +- + csrc/quantization/gptq/qdq_util.cuh | 58 +- + .../gptq_marlin/awq_marlin_repack.cu | 268 ++ + csrc/quantization/gptq_marlin/gptq_marlin.cu | 1547 ++++++-- + .../gptq_marlin/gptq_marlin_repack.cu | 132 +- + csrc/quantization/gptq_marlin/marlin.cuh | 87 + + .../gptq_marlin/marlin_dtypes.cuh | 79 + + csrc/quantization/machete/Readme.md | 45 + + csrc/quantization/machete/generate.py | 659 ++++ + .../machete/machete_collective_builder.cuh | 31 + + .../machete/machete_interleaving_utils.cuh | 35 + + .../quantization/machete/machete_mainloop.cuh | 1470 +++++++ + .../machete/machete_mm_kernel.cuh | 314 ++ + .../machete/machete_mm_launcher.cuh | 75 + + .../machete/machete_prepack_kernel.cuh | 76 + + .../machete/machete_prepack_launcher.cuh | 74 + + .../machete/machete_prepacked_layout.cuh | 249 ++ + csrc/quantization/machete/machete_pytorch.cu | 73 + + csrc/quantization/marlin/dense/LICENSE | 209 + + csrc/quantization/marlin/dense/common/base.h | 32 + + csrc/quantization/marlin/dense/common/mem.h | 89 + + .../marlin/dense/marlin_cuda_kernel.cu | 1073 +++++ + .../marlin/qqq/marlin_qqq_gemm_kernel.cu | 1248 ++++++ + csrc/quantization/marlin/sparse/LICENSE | 203 + + csrc/quantization/marlin/sparse/common/base.h | 51 + + csrc/quantization/marlin/sparse/common/mem.h | 136 + + csrc/quantization/marlin/sparse/common/mma.h | 191 + + .../marlin/sparse/marlin_24_cuda_kernel.cu | 1145 ++++++ + csrc/quantization/vectorization.cuh | 33 + + csrc/rocm/attention.cu | 1120 ++++++ + csrc/rocm/ops.h | 14 + + csrc/rocm/torch_bindings.cpp | 34 + + csrc/sparse/cutlass/sparse_compressor_c3x.cu | 165 + + .../sparse/cutlass/sparse_compressor_entry.cu | 42 + + csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu | 303 ++ + 
csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh | 496 +++ + csrc/sparse/cutlass/sparse_scaled_mm_entry.cu | 70 + + csrc/torch_bindings.cpp | 505 +++ + csrc/type_convert.cuh | 165 + + docs/Makefile | 4 + + docs/README.md | 1 + + docs/requirements-docs.txt | 24 +- + docs/source/_static/custom.js | 18 + + docs/source/_templates/sections/header.html | 39 + + docs/source/api/engine/async_llm_engine.md | 7 + + docs/source/api/engine/index.md | 17 + + docs/source/api/engine/llm_engine.md | 7 + + docs/source/api/inference_params.md | 21 + + docs/source/api/model/adapters.md | 9 + + docs/source/api/model/index.md | 11 + + docs/source/api/model/interfaces.md | 9 + + docs/source/api/model/interfaces_base.md | 9 + + docs/source/api/multimodal/index.md | 28 + + docs/source/api/multimodal/inputs.md | 49 + + docs/source/api/multimodal/parse.md | 9 + + docs/source/api/multimodal/processing.md | 9 + + docs/source/api/multimodal/profiling.md | 9 + + docs/source/api/multimodal/registry.md | 9 + + docs/source/api/offline_inference/index.md | 9 + + docs/source/api/offline_inference/llm.md | 7 + + .../api/offline_inference/llm_inputs.md | 19 + + .../dockerfile-stages-dependency.png | Bin 0 -> 118207 bytes + .../architecture_helm_deployment.png | Bin 0 -> 991484 bytes + .../arch_overview/entrypoints.excalidraw.png | Bin 0 -> 123422 bytes + .../arch_overview/llm_engine.excalidraw.png | Bin 0 -> 178116 bytes + docs/source/assets/design/hierarchy.png | Bin 0 -> 174150 bytes + .../features/disagg_prefill/abstraction.jpg | Bin 0 -> 104673 bytes + .../features/disagg_prefill/overview.jpg | Bin 0 -> 177439 bytes + docs/source/community/meetups.md | 15 + + docs/source/community/sponsors.md | 38 + + docs/source/conf.py | 146 +- + .../contributing/dockerfile/dockerfile.md | 50 + + docs/source/contributing/model/basic.md | 115 + + docs/source/contributing/model/index.md | 27 + + docs/source/contributing/model/multimodal.md | 395 ++ + .../source/contributing/model/registration.md | 55 + + docs/source/contributing/model/tests.md | 63 + + docs/source/contributing/overview.md | 149 + + .../contributing/profiling/profiling_index.md | 41 + + .../contributing/vulnerability_management.md | 43 + + docs/source/deployment/docker.md | 81 + + docs/source/deployment/frameworks/bentoml.md | 7 + + .../source/deployment/frameworks/cerebrium.md | 109 + + docs/source/deployment/frameworks/dstack.md | 102 + + docs/source/deployment/frameworks/helm.md | 250 ++ + docs/source/deployment/frameworks/index.md | 14 + + docs/source/deployment/frameworks/lws.md | 11 + + docs/source/deployment/frameworks/modal.md | 7 + + docs/source/deployment/frameworks/skypilot.md | 345 ++ + docs/source/deployment/frameworks/triton.md | 5 + + docs/source/deployment/integrations/index.md | 9 + + docs/source/deployment/integrations/kserve.md | 7 + + docs/source/deployment/integrations/kubeai.md | 15 + + .../deployment/integrations/llamastack.md | 38 + + docs/source/deployment/k8s.md | 249 ++ + docs/source/deployment/nginx.md | 133 + + docs/source/design/arch_overview.md | 252 ++ + .../source/design/automatic_prefix_caching.md | 42 + + docs/source/design/huggingface_integration.md | 36 + + docs/source/design/kernel/paged_attention.md | 529 +++ + docs/source/design/mm_processing.md | 64 + + docs/source/design/multiprocessing.md | 196 + + docs/source/design/plugin_system.md | 56 + + .../features/automatic_prefix_caching.md | 102 + + docs/source/features/compatibility_matrix.md | 468 +++ + docs/source/features/disagg_prefill.md | 68 + + docs/source/features/lora.md | 214 + + 
docs/source/features/quantization/auto_awq.md | 78 + + docs/source/features/quantization/bnb.md | 47 + + docs/source/features/quantization/fp8.md | 192 + + .../features/quantization/fp8_e4m3_kvcache.md | 44 + + .../features/quantization/fp8_e5m2_kvcache.md | 31 + + docs/source/features/quantization/gguf.md | 72 + + docs/source/features/quantization/index.md | 19 + + docs/source/features/quantization/int8.md | 136 + + .../quantization/supported_hardware.md | 131 + + docs/source/features/spec_decode.md | 265 ++ + docs/source/features/structured_outputs.md | 260 ++ + docs/source/features/tool_calling.md | 300 ++ + docs/source/generate_examples.py | 272 +- + docs/source/getting_started/faq.md | 37 + + .../getting_started/installation/cpu-apple.md | 48 + + .../getting_started/installation/cpu-arm.md | 46 + + .../getting_started/installation/cpu-x86.md | 154 + + .../getting_started/installation/gpu-cuda.md | 236 ++ + .../getting_started/installation/gpu-rocm.md | 163 + + .../getting_started/installation/hpu-gaudi.md | 389 ++ + .../getting_started/installation/index.md | 20 + + .../getting_started/installation/neuron.md | 132 + + .../getting_started/installation/openvino.md | 104 + + .../getting_started/installation/tpu.md | 191 + + .../getting_started/installation/xpu.md | 74 + + docs/source/getting_started/quickstart.md | 186 + + .../source/getting_started/troubleshooting.md | 203 + + docs/source/index.md | 192 + + docs/source/models/extensions/index.md | 8 + + .../models/extensions/runai_model_streamer.md | 53 + + docs/source/models/extensions/tensorizer.md | 16 + + docs/source/models/generative_models.md | 126 + + docs/source/models/pooling_models.md | 136 + + docs/source/models/supported_models.md | 868 +++++ + docs/source/performance/benchmarks.md | 28 + + docs/source/performance/optimization.md | 63 + + docs/source/serving/distributed_serving.md | 105 + + docs/source/serving/engine_args.md | 25 + + docs/source/serving/env_vars.md | 15 + + docs/source/serving/integrations/index.md | 8 + + docs/source/serving/integrations/langchain.md | 30 + + .../source/serving/integrations/llamaindex.md | 26 + + docs/source/serving/metrics.md | 38 + + docs/source/serving/multimodal_inputs.md | 533 +++ + docs/source/serving/offline_inference.md | 79 + + .../serving/openai_compatible_server.md | 439 ++- + docs/source/serving/usage_stats.md | 6 +- + examples/offline_inference/aqlm_example.py | 45 + + examples/offline_inference/arctic.py | 26 + + examples/offline_inference/audio_language.py | 131 + + examples/offline_inference/basic.py | 22 + + .../basic_with_model_default_sampling.py | 30 + + examples/offline_inference/chat.py | 80 + + examples/offline_inference/chat_with_tools.py | 138 + + examples/offline_inference/classification.py | 28 + + examples/offline_inference/cli.py | 80 + + examples/offline_inference/cpu_offload.py | 22 + + examples/offline_inference/distributed.py | 108 + + examples/offline_inference/embedding.py | 28 + + examples/offline_inference/encoder_decoder.py | 99 + + .../offline_inference/florence2_inference.py | 45 + + examples/offline_inference/gguf_inference.py | 32 + + .../offline_inference/llm_engine_example.py | 60 + + .../lora_with_quantization_inference.py | 134 + + examples/offline_inference/mlpspeculator.py | 56 + + .../offline_inference/multilora_inference.py | 106 + + examples/offline_inference/neuron.py | 36 + + .../neuron_int8_quantization.py | 50 + + .../offline_inference/openai/openai_batch.md | 205 + + .../openai/openai_example_batch.jsonl | 2 + + 
examples/offline_inference/pixtral.py | 165 + + examples/offline_inference/prefix_caching.py | 83 + + examples/offline_inference/profiling.py | 458 +++ + .../offline_inference/save_sharded_state.py | 75 + + examples/offline_inference/scoring.py | 23 + + .../offline_inference/simple_profiling.py | 40 + + .../offline_inference/structured_outputs.py | 78 + + examples/offline_inference/tpu.py | 28 + + examples/offline_inference/vision_language.py | 705 ++++ + .../vision_language_embedding.py | 170 + + .../vision_language_multi_image.py | 493 +++ + examples/offline_inference/whisper.py | 59 + + examples/online_serving/api_client.py | 84 + + .../online_serving/chart-helm/.helmignore | 6 + + examples/online_serving/chart-helm/Chart.yaml | 21 + + examples/online_serving/chart-helm/README.md | 21 + + examples/online_serving/chart-helm/ct.yaml | 3 + + .../online_serving/chart-helm/lintconf.yaml | 42 + + .../chart-helm/templates/_helpers.tpl | 164 + + .../chart-helm/templates/configmap.yaml | 11 + + .../chart-helm/templates/custom-objects.yaml | 6 + + .../chart-helm/templates/deployment.yaml | 122 + + .../chart-helm/templates/hpa.yaml | 31 + + .../chart-helm/templates/job.yaml | 37 + + .../templates/poddisruptionbudget.yaml | 7 + + .../chart-helm/templates/pvc.yaml | 13 + + .../chart-helm/templates/secrets.yaml | 10 + + .../chart-helm/templates/service.yaml | 14 + + .../chart-helm/values.schema.json | 265 ++ + .../online_serving/chart-helm/values.yaml | 119 + + .../online_serving/disaggregated_prefill.sh | 109 + + .../gradio_openai_chatbot_webserver.py | 82 + + examples/online_serving/gradio_webserver.py | 52 + + .../openai_chat_completion_client.py | 36 + + ...i_chat_completion_client_for_multimodal.py | 321 ++ + ...penai_chat_completion_client_with_tools.py | 162 + + ...enai_chat_completion_structured_outputs.py | 94 + + ...ai_chat_embedding_client_for_multimodal.py | 120 + + .../openai_completion_client.py | 31 + + .../openai_cross_encoder_score.py | 59 + + .../online_serving/openai_embedding_client.py | 25 + + .../online_serving/openai_pooling_client.py | 51 + + examples/online_serving/opentelemetry/Otel.md | 82 + + .../opentelemetry/dummy_client.py | 35 + + .../prometheus_grafana/README.md | 54 + + .../prometheus_grafana/docker-compose.yaml | 19 + + .../prometheus_grafana/grafana.json | 1557 ++++++++ + .../prometheus_grafana/prometheus.yaml | 10 + + examples/online_serving/run_cluster.sh | 49 + + .../online_serving/sagemaker-entrypoint.sh | 24 + + examples/other/fp8/README.md | 96 + + examples/other/fp8/extract_scales.py | 367 ++ + examples/other/fp8/quantizer/README.md | 32 + + examples/other/fp8/quantizer/quantize.py | 367 ++ + examples/other/logging_configuration.md | 172 + + examples/other/tensorize_vllm_model.py | 240 ++ + examples/template_blip2.jinja | 11 + + examples/template_dse_qwen2_vl.jinja | 7 + + examples/template_llava.jinja | 23 + + examples/template_pixtral_hf.jinja | 38 + + examples/template_vlm2vec.jinja | 16 + + examples/tool_chat_template_granite.jinja | 36 + + .../tool_chat_template_granite_20b_fc.jinja | 130 + + examples/tool_chat_template_hermes.jinja | 130 + + .../tool_chat_template_internlm2_tool.jinja | 60 + + .../tool_chat_template_llama3.1_json.jinja | 120 + + .../tool_chat_template_llama3.2_json.jinja | 133 + + ...tool_chat_template_llama3.2_pythonic.jinja | 98 + + examples/tool_chat_template_mistral.jinja | 86 + + .../tool_chat_template_mistral_parallel.jinja | 93 + + examples/tool_chat_template_toolace.jinja | 65 + + find_cuda_init.py | 33 + + format.sh | 141 +- + 
pyproject.toml | 64 +- + python_only_dev.py | 14 + + requirements-build.txt | 8 +- + requirements-common.txt | 45 +- + requirements-cpu.txt | 8 +- + requirements-cuda.txt | 11 +- + requirements-dev.txt | 36 +- + requirements-hpu.txt | 11 + + requirements-lint.txt | 15 + + requirements-neuron.txt | 4 +- + requirements-openvino.txt | 8 + + requirements-rocm.txt | 9 +- + requirements-test.in | 32 + + requirements-test.txt | 582 +++ + requirements-tpu.txt | 25 + + requirements-xpu.txt | 16 + + setup.py | 422 +- + tests/async_engine/__init__.py | 0 + tests/async_engine/api_server_async_engine.py | 13 +- + tests/async_engine/test_api_server.py | 11 +- + tests/async_engine/test_async_llm_engine.py | 300 +- + tests/async_engine/test_request_tracker.py | 27 +- + tests/basic_correctness/__init__.py | 0 + .../test_basic_correctness.py | 209 +- + .../basic_correctness/test_chunked_prefill.py | 310 +- + tests/basic_correctness/test_cpu_offload.py | 6 + + tests/basic_correctness/test_preemption.py | 227 +- + tests/compile/__init__.py | 0 + tests/compile/backend.py | 37 + + tests/compile/piecewise/__init__.py | 0 + tests/compile/piecewise/test_simple.py | 109 + + tests/compile/piecewise/test_toy_llama.py | 447 +++ + tests/compile/test_basic_correctness.py | 141 + + tests/compile/test_full_graph.py | 20 + + tests/compile/test_functionalization.py | 100 + + tests/compile/test_fusion.py | 116 + + tests/compile/test_pass_manager.py | 35 + + tests/compile/test_wrapper.py | 61 + + tests/compile/utils.py | 97 + + tests/conftest.py | 1026 ++++- + tests/core/block/e2e/__init__.py | 0 + tests/core/block/e2e/conftest.py | 30 +- + tests/core/block/e2e/test_correctness.py | 247 +- + .../e2e/test_correctness_sliding_window.py | 170 + + tests/core/block/test_block_manager.py | 491 +++ + tests/core/block/test_block_table.py | 19 +- + .../block/test_cpu_gpu_block_allocator.py | 32 +- + tests/core/block/test_naive_block.py | 49 +- + tests/core/block/test_prefix_caching_block.py | 549 ++- + tests/core/test_chunked_prefill_scheduler.py | 372 +- + tests/core/test_num_computed_tokens_update.py | 80 + + tests/core/test_scheduler.py | 777 ++-- + tests/core/test_scheduler_encoder_decoder.py | 104 + + tests/core/test_serialization.py | 33 + + tests/core/utils.py | 214 +- + tests/data/test_config.yaml | 5 + + tests/distributed/__init__.py | 0 + tests/distributed/test_ca_buffer_sharing.py | 59 + + tests/distributed/test_comm_ops.py | 134 +- + tests/distributed/test_custom_all_reduce.py | 99 +- + tests/distributed/test_distributed_oot.py | 6 + + .../distributed/test_multi_node_assignment.py | 64 + + tests/distributed/test_pipeline_parallel.py | 427 ++ + tests/distributed/test_pipeline_partition.py | 34 + + tests/distributed/test_pp_cudagraph.py | 30 + + tests/distributed/test_pynccl.py | 267 +- + tests/distributed/test_same_node.py | 34 + + tests/distributed/test_shm_broadcast.py | 116 + + tests/distributed/test_utils.py | 141 + + tests/encoder_decoder/__init__.py | 0 + tests/encoder_decoder/test_e2e_correctness.py | 119 + + tests/engine/__init__.py | 0 + tests/engine/output_processor/__init__.py | 0 + .../output_processor/test_multi_step.py | 23 +- + .../output_processor/test_stop_checker.py | 86 + + tests/engine/test_arg_utils.py | 142 + + tests/engine/test_custom_executor.py | 91 + + tests/engine/test_multiproc_workers.py | 6 +- + tests/engine/test_short_mm_context.py | 29 + + tests/engine/test_skip_tokenizer_init.py | 7 +- + tests/engine/test_stop_reason.py | 11 +- + tests/engine/test_stop_strings.py | 158 +- + 
tests/entrypoints/__init__.py | 0 + tests/entrypoints/conftest.py | 159 + + tests/entrypoints/llm/__init__.py | 0 + tests/entrypoints/llm/test_accuracy.py | 56 + + tests/entrypoints/llm/test_chat.py | 92 + + tests/entrypoints/llm/test_encode.py | 107 + + tests/entrypoints/llm/test_generate.py | 104 + + .../llm/test_generate_multiple_loras.py | 66 + + tests/entrypoints/llm/test_gpu_utilization.py | 25 + + tests/entrypoints/llm/test_guided_generate.py | 265 ++ + tests/entrypoints/llm/test_init.py | 22 + + tests/entrypoints/llm/test_lazy_outlines.py | 76 + + .../entrypoints/llm/test_prompt_validation.py | 24 + + tests/entrypoints/offline_mode/__init__.py | 0 + .../offline_mode/test_offline_mode.py | 82 + + tests/entrypoints/openai/__init__.py | 0 + tests/entrypoints/openai/test_accuracy.py | 85 + + .../openai/test_async_tokenization.py | 137 + + tests/entrypoints/openai/test_audio.py | 380 ++ + tests/entrypoints/openai/test_basic.py | 156 + + tests/entrypoints/openai/test_chat.py | 996 +++++ + tests/entrypoints/openai/test_chat_echo.py | 79 + + .../entrypoints/openai/test_chat_template.py | 117 + + .../entrypoints/openai/test_chunked_prompt.py | 126 + + tests/entrypoints/openai/test_cli_args.py | 131 + + tests/entrypoints/openai/test_completion.py | 779 ++++ + tests/entrypoints/openai/test_embedding.py | 274 ++ + .../openai/test_encoder_decoder.py | 52 + + .../entrypoints/openai/test_lora_adapters.py | 269 ++ + tests/entrypoints/openai/test_metrics.py | 236 ++ + tests/entrypoints/openai/test_models.py | 64 + + .../openai/test_oot_registration.py | 42 + + tests/entrypoints/openai/test_pooling.py | 238 ++ + .../openai/test_prompt_validation.py | 57 + + .../openai/test_return_tokens_as_ids.py | 87 + + tests/entrypoints/openai/test_root_path.py | 103 + + tests/entrypoints/openai/test_run_batch.py | 104 + + tests/entrypoints/openai/test_score.py | 93 + + tests/entrypoints/openai/test_serving_chat.py | 136 +- + .../entrypoints/openai/test_serving_models.py | 121 + + tests/entrypoints/openai/test_shutdown.py | 37 + + tests/entrypoints/openai/test_tokenization.py | 170 + + tests/entrypoints/openai/test_video.py | 348 ++ + tests/entrypoints/openai/test_vision.py | 349 ++ + .../openai/test_vision_embedding.py | 95 + + .../openai/tool_parsers/__init__.py | 0 + .../tool_parsers/test_pythonic_tool_parser.py | 160 + + .../entrypoints/openai/tool_parsers/utils.py | 123 + + tests/entrypoints/test_chat_utils.py | 796 ++++ + tests/kernels/__init__.py | 0 + tests/kernels/quant_utils.py | 88 + + tests/kernels/test_activation.py | 67 +- + tests/kernels/test_aqlm.py | 37 + + tests/kernels/test_attention.py | 181 +- + tests/kernels/test_attention_selector.py | 100 + + tests/kernels/test_awq.py | 43 + + tests/kernels/test_awq_marlin.py | 167 + + tests/kernels/test_awq_triton.py | 170 + + tests/kernels/test_block_fp8.py | 265 ++ + tests/kernels/test_blocksparse_attention.py | 439 +++ + tests/kernels/test_cache.py | 235 +- + tests/kernels/test_cascade_flash_attn.py | 182 + + tests/kernels/test_causal_conv1d.py | 435 +++ + tests/kernels/test_cutlass.py | 455 +++ + tests/kernels/test_encoder_decoder_attn.py | 1101 ++++++ + tests/kernels/test_flash_attn.py | 241 ++ + tests/kernels/test_flashinfer.py | 470 +++ + tests/kernels/test_fp8_quant.py | 114 + + tests/kernels/test_fused_quant_layernorm.py | 171 + + tests/kernels/test_ggml.py | 22 + + tests/kernels/test_gguf.py | 127 + + tests/kernels/test_gptq.py | 29 + + tests/kernels/test_int8_quant.py | 190 + + tests/kernels/test_layernorm.py | 96 +- + 
tests/kernels/test_machete_mm.py | 406 ++ + tests/kernels/test_mamba_ssm.py | 720 ++++ + tests/kernels/test_marlin_gemm.py | 616 +++ + tests/kernels/test_moe.py | 324 +- + tests/kernels/test_permute_cols.py | 15 + + tests/kernels/test_pos_encoding.py | 126 +- + tests/kernels/test_prefix_prefill.py | 334 +- + tests/kernels/test_rotary_embedding.py | 62 + + tests/kernels/test_semi_structured.py | 134 + + tests/kernels/test_triton_scaled_mm.py | 106 + + tests/kernels/test_utils.py | 24 + + tests/kernels/utils.py | 1100 ++++++ + tests/kv_transfer/disagg_test.py | 119 + + tests/kv_transfer/module_test.py | 64 + + tests/kv_transfer/test_lookup_buffer.py | 160 + + tests/kv_transfer/test_lookup_buffer.sh | 8 + + tests/kv_transfer/test_send_recv.py | 158 + + tests/kv_transfer/test_send_recv.sh | 9 + + tests/lora/conftest.py | 211 +- + tests/lora/data/__init__.py | 0 + tests/lora/data/long_context_test_data.py | 119 + + tests/lora/test_baichuan.py | 36 +- + tests/lora/test_chatglm3_tp.py | 105 + + tests/lora/test_gemma.py | 19 +- + tests/lora/test_jamba.py | 54 + + tests/lora/test_layers.py | 791 +++- + tests/lora/test_llama_tp.py | 159 + + tests/lora/test_long_context.py | 299 ++ + tests/lora/test_lora_bias_e2e.py | 52 + + tests/lora/test_lora_checkpoints.py | 56 +- + tests/lora/test_lora_huggingface.py | 39 + + tests/lora/test_lora_manager.py | 570 ++- + tests/lora/test_minicpmv_tp.py | 122 + + tests/lora/test_mixtral.py | 90 +- + tests/lora/test_phi.py | 70 + + tests/lora/test_punica_ops_sizes.py | 400 ++ + tests/lora/test_punica_ops_variation.py | 316 ++ + tests/lora/test_quant_model.py | 115 +- + tests/lora/test_qwen2vl.py | 81 + + tests/lora/test_tokenizer_group.py | 24 +- + tests/lora/test_utils.py | 83 +- + tests/lora/test_worker.py | 17 +- + tests/lora/utils.py | 280 +- + tests/metrics/__init__.py | 0 + tests/metrics/test_metrics.py | 324 +- + tests/model_executor/__init__.py | 0 + tests/model_executor/conftest.py | 49 + + .../model_executor/test_enabled_custom_ops.py | 89 + + .../model_executor/test_guided_processors.py | 128 + + .../test_model_load_with_params.py | 119 + + tests/models/__init__.py | 0 + tests/models/decoder_only/__init__.py | 0 + .../decoder_only/audio_language/__init__.py | 0 + .../audio_language/test_ultravox.py | 268 ++ + .../models/decoder_only/language/__init__.py | 0 + .../models/decoder_only/language/test_aqlm.py | 69 + + .../models/decoder_only/language/test_fp8.py | 100 + + .../models/decoder_only/language/test_gguf.py | 130 + + .../decoder_only/language/test_gptq_marlin.py | 84 + + .../language/test_gptq_marlin_24.py | 73 + + .../decoder_only/language/test_granite.py | 41 + + .../decoder_only/language/test_jamba.py | 339 ++ + .../decoder_only/language/test_mamba.py | 323 ++ + .../decoder_only/language/test_mistral.py | 335 ++ + .../decoder_only/language/test_modelopt.py | 80 + + .../decoder_only/language/test_models.py | 86 + + .../decoder_only/language/test_phimoe.py | 102 + + .../decoder_only/vision_language/__init__.py | 0 + .../decoder_only/vision_language/test_awq.py | 120 + + .../vision_language/test_h2ovl.py | 129 + + .../vision_language/test_intern_vit.py | 77 + + .../vision_language/test_models.py | 742 ++++ + .../vision_language/test_phi3v.py | 234 ++ + .../vision_language/test_pixtral.py | 270 ++ + .../vision_language/test_qwen2_vl.py | 429 ++ + .../vision_language/vlm_utils/__init__.py | 0 + .../vision_language/vlm_utils/builders.py | 236 ++ + .../vlm_utils/case_filtering.py | 157 + + .../vision_language/vlm_utils/core.py | 156 + + 
.../vlm_utils/custom_inputs.py | 103 + + .../vision_language/vlm_utils/model_utils.py | 582 +++ + .../vision_language/vlm_utils/runners.py | 139 + + .../vision_language/vlm_utils/types.py | 198 + + tests/models/embedding/__init__.py | 0 + tests/models/embedding/language/__init__.py | 0 + .../embedding/language/test_cls_models.py | 42 + + .../embedding/language/test_embedding.py | 75 + + .../models/embedding/language/test_gritlm.py | 200 + + .../models/embedding/language/test_scoring.py | 89 + + tests/models/embedding/utils.py | 30 + + .../embedding/vision_language/__init__.py | 0 + .../vision_language/test_dse_qwen2_vl.py | 209 + + .../vision_language/test_llava_next.py | 140 + + .../embedding/vision_language/test_phi3v.py | 126 + + tests/models/encoder_decoder/__init__.py | 0 + .../audio_language/__init__.py | 0 + .../audio_language/test_whisper.py | 136 + + .../encoder_decoder/language/__init__.py | 0 + .../encoder_decoder/language/test_bart.py | 222 ++ + .../vision_language/__init__.py | 0 + .../vision_language/test_broadcast.py | 35 + + .../vision_language/test_florence2.py | 102 + + .../vision_language/test_mllama.py | 367 ++ + tests/models/fixtures/pixtral_chat.json | 1 + + .../models/fixtures/pixtral_chat_engine.json | 1 + + tests/models/multimodal/__init__.py | 0 + .../models/multimodal/processing/__init__.py | 0 + .../multimodal/processing/test_common.py | 201 + + .../multimodal/processing/test_idefics3.py | 178 + + .../multimodal/processing/test_internvl.py | 206 + + .../multimodal/processing/test_llava_next.py | 132 + + .../processing/test_llava_onevision.py | 132 + + .../multimodal/processing/test_phi3v.py | 55 + + .../models/multimodal/processing/test_qwen.py | 144 + + .../multimodal/processing/test_qwen2_vl.py | 54 + + tests/models/registry.py | 248 ++ + tests/models/test_initialization.py | 63 + + tests/models/test_oot_registration.py | 80 +- + tests/models/test_registry.py | 94 + + tests/models/utils.py | 282 +- + tests/mq_llm_engine/__init__.py | 0 + tests/mq_llm_engine/test_abort.py | 67 + + tests/mq_llm_engine/test_error_handling.py | 293 ++ + tests/mq_llm_engine/test_load.py | 57 + + tests/mq_llm_engine/utils.py | 78 + + tests/multi_step/__init__.py | 0 + .../multi_step/test_correctness_async_llm.py | 224 ++ + tests/multi_step/test_correctness_llm.py | 352 ++ + tests/multimodal/__init__.py | 0 + tests/multimodal/test_inputs.py | 95 + + tests/multimodal/test_processing.py | 613 +++ + tests/multimodal/test_processor_kwargs.py | 400 ++ + tests/multimodal/test_utils.py | 400 ++ + tests/multimodal/utils.py | 33 + + tests/plugins/vllm_add_dummy_model/setup.py | 9 + + .../vllm_add_dummy_model/__init__.py | 20 + + .../my_gemma_embedding.py | 70 + + .../vllm_add_dummy_model/my_llava.py | 26 + + .../vllm_add_dummy_model/my_opt.py | 19 + + .../plugins/vllm_add_dummy_platform/setup.py | 11 + + .../vllm_add_dummy_platform/__init__.py | 5 + + .../vllm_add_dummy_platform/dummy_platform.py | 5 + + tests/plugins_tests/test_platform_plugins.py | 16 + + tests/prefix_caching/__init__.py | 0 + .../test_disable_sliding_window.py | 44 + + tests/prefix_caching/test_prefix_caching.py | 254 +- + tests/prompt_adapter/test_bloom.py | 45 + + .../test_multi_adapter_inference.py | 53 + + tests/prompt_adapter/test_pa_lora.py | 61 + + tests/quantization/__init__.py | 0 + tests/quantization/test_bitsandbytes.py | 168 + + tests/quantization/test_compressed_tensors.py | 313 ++ + tests/quantization/test_configs.py | 10 +- + tests/quantization/test_cpu_offload.py | 68 + + 
tests/quantization/test_experts_int8.py | 28 + + tests/quantization/test_fp8.py | 144 +- + tests/quantization/test_ipex_quant.py | 30 + + tests/quantization/test_lm_head.py | 47 + + tests/quantization/utils.py | 15 + + tests/runai_model_streamer/__init__.py | 0 + .../test_runai_model_streamer_loader.py | 31 + + .../runai_model_streamer/test_weight_utils.py | 39 + + tests/samplers/__init__.py | 0 + tests/samplers/test_beam_search.py | 35 +- + tests/samplers/test_ignore_eos.py | 24 +- + tests/samplers/test_logits_processor.py | 85 +- + tests/samplers/test_logprobs.py | 118 +- + tests/samplers/test_no_bad_words.py | 185 + + tests/samplers/test_ranks.py | 38 +- + tests/samplers/test_rejection_sampler.py | 231 +- + tests/samplers/test_sampler.py | 315 +- + tests/samplers/test_seeded_generate.py | 11 +- + .../test_typical_acceptance_sampler.py | 470 +++ + tests/spec_decode/e2e/conftest.py | 519 ++- + tests/spec_decode/e2e/test_compatibility.py | 134 +- + .../spec_decode/e2e/test_eagle_correctness.py | 309 ++ + tests/spec_decode/e2e/test_integration.py | 140 + + .../e2e/test_integration_dist_tp2.py | 174 + + .../e2e/test_integration_dist_tp4.py | 121 + + tests/spec_decode/e2e/test_logprobs.py | 349 +- + .../e2e/test_medusa_correctness.py | 383 ++ + tests/spec_decode/e2e/test_mlp_correctness.py | 478 +++ + .../e2e/test_multistep_correctness.py | 486 ++- + .../spec_decode/e2e/test_ngram_correctness.py | 262 +- + tests/spec_decode/e2e/test_seed.py | 67 + + tests/spec_decode/test_batch_expansion.py | 23 +- + tests/spec_decode/test_dynamic_spec_decode.py | 87 + + tests/spec_decode/test_metrics.py | 137 +- + tests/spec_decode/test_multi_step_worker.py | 441 ++- + tests/spec_decode/test_ngram_worker.py | 50 +- + tests/spec_decode/test_scorer.py | 114 + + tests/spec_decode/test_spec_decode_worker.py | 464 ++- + tests/spec_decode/test_utils.py | 76 +- + tests/spec_decode/utils.py | 143 +- + tests/standalone_tests/lazy_torch_compile.py | 28 + + tests/standalone_tests/python_only_compile.sh | 30 + + tests/system_messages/sonnet3.5_nov2024.txt | 71 + + tests/tensorizer_loader/conftest.py | 47 + + tests/tensorizer_loader/test_tensorizer.py | 448 ++- + tests/test_cache_block_hashing.py | 16 +- + tests/test_config.py | 252 +- + tests/test_embedded_commit.py | 8 + + tests/test_inputs.py | 79 + + tests/test_logger.py | 18 +- + tests/test_logits_processor.py | 33 +- + tests/test_regression.py | 21 + + tests/test_scalartype.py | 36 + + tests/test_sequence.py | 46 +- + tests/test_sharded_state_loader.py | 131 + + tests/test_utils.py | 447 +++ + tests/tokenization/test_detokenize.py | 189 +- + tests/tokenization/test_get_eos.py | 31 + + tests/tokenization/test_tokenizer_group.py | 120 +- + tests/tool_use/__init__.py | 0 + tests/tool_use/conftest.py | 38 + + ...est_chat_completion_request_validations.py | 71 + + tests/tool_use/test_chat_completions.py | 146 + + tests/tool_use/test_jamba_tool_parser.py | 275 ++ + tests/tool_use/test_parallel_tool_calls.py | 205 + + tests/tool_use/test_tool_calls.py | 192 + + tests/tool_use/utils.py | 313 ++ + tests/tpu/__init__.py | 0 + tests/tpu/test_compilation.py | 79 + + tests/tpu/test_custom_dispatcher.py | 22 + + tests/tpu/test_quantization_accuracy.py | 49 + + tests/tracing/__init__.py | 0 + tests/tracing/test_tracing.py | 202 + + tests/utils.py | 825 ++++ + tests/v1/__init__.py | 0 + tests/v1/core/test_kv_cache_utils.py | 245 ++ + tests/v1/core/test_prefix_caching.py | 566 +++ + tests/v1/e2e/__init__.py | 0 + tests/v1/e2e/test_cascade_attention.py | 22 + + 
tests/v1/engine/__init__.py | 0 + tests/v1/engine/test_async_llm.py | 116 + + tests/v1/engine/test_engine_args.py | 46 + + tests/v1/engine/test_engine_core.py | 177 + + tests/v1/engine/test_engine_core_client.py | 202 + + tests/v1/engine/test_output_processor.py | 295 ++ + tests/v1/sample/__init__.py | 0 + tests/v1/sample/test_sampler.py | 321 ++ + tests/v1/worker/__init__.py | 0 + tests/v1/worker/test_gpu_input_batch.py | 224 ++ + tests/vllm_test_utils/setup.py | 7 + + .../vllm_test_utils/__init__.py | 9 + + .../vllm_test_utils/vllm_test_utils/blame.py | 53 + + .../vllm_test_utils/monitor.py | 68 + + tests/weight_loading/models-large.txt | 5 + + tests/weight_loading/models.txt | 33 + + .../run_model_weight_loading_test.sh | 49 + + tests/weight_loading/test_weight_loading.py | 32 + + .../test_encoder_decoder_model_runner.py | 646 ++++ + tests/worker/test_model_input.py | 241 ++ + tests/worker/test_model_runner.py | 281 +- + tests/worker/test_profile.py | 65 + + tests/worker/test_swap.py | 37 +- + tools/actionlint.sh | 13 + + tools/check_repo.sh | 14 + + tools/doc-lint.sh | 3 + + tools/mypy.sh | 33 + + tools/png-lint.sh | 15 + + tools/profiler/print_layerwise_table.py | 82 + + tools/profiler/visualize_layerwise_profile.py | 590 +++ + tools/report_build_time_ninja.py | 312 ++ + tools/shellcheck.sh | 22 + + use_existing_torch.py | 18 + + vllm/__init__.py | 24 +- + vllm/_custom_ops.py | 967 ++++- + vllm/_ipex_ops.py | 226 ++ + vllm/adapter_commons/__init__.py | 0 + vllm/adapter_commons/layers.py | 14 + + vllm/adapter_commons/models.py | 103 + + vllm/adapter_commons/request.py | 23 + + vllm/adapter_commons/utils.py | 90 + + vllm/adapter_commons/worker_manager.py | 36 + + vllm/assets/__init__.py | 0 + vllm/assets/audio.py | 31 + + vllm/assets/base.py | 38 + + vllm/assets/image.py | 29 + + vllm/assets/video.py | 82 + + vllm/attention/__init__.py | 8 +- + vllm/attention/backends/abstract.py | 205 +- + vllm/attention/backends/blocksparse_attn.py | 454 +++ + vllm/attention/backends/flash_attn.py | 899 ++++- + vllm/attention/backends/flashinfer.py | 836 +++- + vllm/attention/backends/hpu_attn.py | 281 ++ + vllm/attention/backends/ipex_attn.py | 386 ++ + vllm/attention/backends/openvino.py | 140 + + vllm/attention/backends/pallas.py | 345 ++ + vllm/attention/backends/placeholder_attn.py | 403 ++ + vllm/attention/backends/rocm_flash_attn.py | 472 ++- + vllm/attention/backends/torch_sdpa.py | 613 ++- + vllm/attention/backends/utils.py | 574 +++ + vllm/attention/backends/xformers.py | 577 ++- + vllm/attention/layer.py | 275 +- + .../ops/blocksparse_attention/__init__.py | 0 + .../blocksparse_attention_kernel.py | 430 ++ + .../ops/blocksparse_attention/interface.py | 236 ++ + .../ops/blocksparse_attention/utils.py | 242 ++ + vllm/attention/ops/hpu_paged_attn.py | 103 + + vllm/attention/ops/ipex_attn.py | 191 + + vllm/attention/ops/paged_attn.py | 69 +- + vllm/attention/ops/prefix_prefill.py | 180 +- + vllm/attention/ops/triton_flash_attention.py | 10 + + vllm/attention/selector.py | 225 +- + vllm/beam_search.py | 71 + + vllm/compilation/__init__.py | 0 + vllm/compilation/backends.py | 802 ++++ + vllm/compilation/counter.py | 31 + + vllm/compilation/decorators.py | 235 ++ + vllm/compilation/fix_functionalization.py | 180 + + vllm/compilation/fusion.py | 615 +++ + vllm/compilation/fx_utils.py | 42 + + vllm/compilation/inductor_pass.py | 84 + + vllm/compilation/monitor.py | 36 + + vllm/compilation/multi_output_match.py | 106 + + vllm/compilation/pass_manager.py | 77 + + vllm/compilation/reshapes.py | 88 + + 
vllm/compilation/vllm_inductor_pass.py | 49 + + vllm/compilation/wrapper.py | 105 + + vllm/config.py | 2904 ++++++++++++-- + vllm/connections.py | 167 + + vllm/core/block/block_table.py | 195 +- + vllm/core/block/common.py | 276 +- + vllm/core/block/cpu_gpu_block_allocator.py | 249 +- + vllm/core/block/interfaces.py | 127 +- + vllm/core/block/naive_block.py | 266 +- + vllm/core/block/prefix_caching_block.py | 879 ++++- + vllm/core/block/utils.py | 26 + + vllm/core/block_manager.py | 516 +++ + vllm/core/evictor.py | 154 + + vllm/core/interfaces.py | 40 +- + vllm/core/placeholder_block_space_manager.py | 94 + + vllm/core/scheduler.py | 1297 +++++-- + vllm/distributed/communication_op.py | 233 +- + .../device_communicators/cuda_wrapper.py | 172 + + .../device_communicators/custom_all_reduce.py | 469 +-- + .../custom_all_reduce_utils.py | 255 ++ + .../device_communicators/hpu_communicator.py | 48 + + .../device_communicators/pynccl.py | 406 +- + .../device_communicators/pynccl_wrapper.py | 338 ++ + .../device_communicators/shm_broadcast.py | 528 +++ + .../device_communicators/tpu_communicator.py | 61 + + .../device_communicators/xpu_communicator.py | 47 + + vllm/distributed/kv_transfer/README.md | 30 + + vllm/distributed/kv_transfer/__init__.py | 0 + .../kv_transfer/disagg_prefill_workflow.jpg | Bin 0 -> 142656 bytes + .../kv_transfer/kv_connector/__init__.py | 0 + .../kv_transfer/kv_connector/base.py | 122 + + .../kv_transfer/kv_connector/factory.py | 48 + + .../kv_connector/simple_connector.py | 312 ++ + .../kv_transfer/kv_lookup_buffer/__init__.py | 0 + .../kv_transfer/kv_lookup_buffer/base.py | 108 + + .../kv_lookup_buffer/simple_buffer.py | 242 ++ + .../kv_transfer/kv_pipe/__init__.py | 0 + vllm/distributed/kv_transfer/kv_pipe/base.py | 65 + + .../kv_transfer/kv_pipe/mooncake_pipe.py | 272 ++ + .../kv_transfer/kv_pipe/pynccl_pipe.py | 276 ++ + .../kv_transfer/kv_transfer_agent.py | 75 + + vllm/distributed/parallel_state.py | 1307 ++++++- + vllm/distributed/utils.py | 255 +- + vllm/engine/arg_utils.py | 1028 ++++- + vllm/engine/async_llm_engine.py | 1130 ++++-- + vllm/engine/async_timeout.py | 189 + + vllm/engine/llm_engine.py | 1847 +++++++-- + vllm/engine/metrics.py | 683 +++- + vllm/engine/metrics_types.py | 101 + + vllm/engine/multiprocessing/__init__.py | 152 + + vllm/engine/multiprocessing/client.py | 689 ++++ + vllm/engine/multiprocessing/engine.py | 389 ++ + vllm/engine/output_processor/interfaces.py | 20 +- + vllm/engine/output_processor/multi_step.py | 149 +- + vllm/engine/output_processor/single_step.py | 318 +- + vllm/engine/output_processor/stop_checker.py | 77 +- + vllm/engine/output_processor/util.py | 16 +- + vllm/engine/protocol.py | 277 ++ + vllm/entrypoints/api_server.py | 101 +- + vllm/entrypoints/chat_utils.py | 1001 +++++ + vllm/entrypoints/launcher.py | 103 + + vllm/entrypoints/llm.py | 1198 +++++- + vllm/entrypoints/logger.py | 42 + + vllm/entrypoints/openai/api_server.py | 816 +++- + vllm/entrypoints/openai/cli_args.py | 179 +- + vllm/entrypoints/openai/logits_processors.py | 86 + + vllm/entrypoints/openai/protocol.py | 1118 +++++- + vllm/entrypoints/openai/run_batch.py | 317 ++ + vllm/entrypoints/openai/serving_chat.py | 918 +++-- + vllm/entrypoints/openai/serving_completion.py | 541 ++- + vllm/entrypoints/openai/serving_embedding.py | 240 ++ + vllm/entrypoints/openai/serving_engine.py | 609 ++- + vllm/entrypoints/openai/serving_models.py | 250 ++ + vllm/entrypoints/openai/serving_pooling.py | 233 ++ + vllm/entrypoints/openai/serving_score.py | 226 ++ + 
.../openai/serving_tokenization.py | 144 + + .../openai/tool_parsers/__init__.py | 16 + + .../tool_parsers/abstract_tool_parser.py | 160 + + .../granite_20b_fc_tool_parser.py | 251 ++ + .../tool_parsers/granite_tool_parser.py | 229 ++ + .../openai/tool_parsers/hermes_tool_parser.py | 367 ++ + .../tool_parsers/internlm2_tool_parser.py | 208 + + .../openai/tool_parsers/jamba_tool_parser.py | 300 ++ + .../openai/tool_parsers/llama_tool_parser.py | 258 ++ + .../tool_parsers/mistral_tool_parser.py | 322 ++ + .../tool_parsers/pythonic_tool_parser.py | 289 ++ + vllm/entrypoints/openai/tool_parsers/utils.py | 121 + + vllm/entrypoints/utils.py | 57 + + vllm/envs.py | 324 +- + vllm/executor/cpu_executor.py | 333 +- + vllm/executor/distributed_gpu_executor.py | 149 +- + vllm/executor/executor_base.py | 74 +- + vllm/executor/gpu_executor.py | 131 +- + vllm/executor/hpu_executor.py | 202 + + vllm/executor/msgspec_utils.py | 27 + + vllm/executor/multiproc_gpu_executor.py | 223 ++ + vllm/executor/multiproc_worker_utils.py | 77 +- + vllm/executor/multiproc_xpu_executor.py | 26 + + vllm/executor/neuron_executor.py | 59 +- + vllm/executor/openvino_executor.py | 125 + + vllm/executor/ray_gpu_executor.py | 541 ++- + vllm/executor/ray_hpu_executor.py | 515 +++ + vllm/executor/ray_tpu_executor.py | 343 ++ + vllm/executor/ray_utils.py | 308 +- + vllm/executor/ray_xpu_executor.py | 40 + + vllm/executor/tpu_executor.py | 142 + + vllm/executor/xpu_executor.py | 39 + + vllm/forward_context.py | 99 + + vllm/inputs/__init__.py | 37 + + vllm/inputs/data.py | 403 ++ + vllm/inputs/parse.py | 112 + + vllm/inputs/preprocess.py | 707 ++++ + vllm/inputs/registry.py | 464 +++ + vllm/logger.py | 80 +- + vllm/logging_utils/__init__.py | 5 + + vllm/logging_utils/formatter.py | 15 + + vllm/logits_process.py | 119 + + vllm/lora/fully_sharded_layers.py | 279 +- + vllm/lora/layers.py | 1260 +++--- + vllm/lora/lora.py | 35 +- + vllm/lora/models.py | 612 +-- + vllm/lora/ops/__init__.py | 0 + vllm/lora/ops/torch_ops/__init__.py | 13 + + vllm/lora/ops/torch_ops/lora_ops.py | 113 + + vllm/lora/ops/triton_ops/__init__.py | 13 + + vllm/lora/ops/triton_ops/bgmv_expand.py | 187 + + vllm/lora/ops/triton_ops/bgmv_expand_slice.py | 206 + + vllm/lora/ops/triton_ops/bgmv_shrink.py | 167 + + vllm/lora/ops/triton_ops/sgmv_expand.py | 278 ++ + vllm/lora/ops/triton_ops/sgmv_shrink.py | 239 ++ + vllm/lora/ops/triton_ops/utils.py | 165 + + vllm/lora/peft_helper.py | 80 + + vllm/lora/punica_wrapper/__init__.py | 7 + + vllm/lora/punica_wrapper/punica_base.py | 482 +++ + vllm/lora/punica_wrapper/punica_cpu.py | 346 ++ + vllm/lora/punica_wrapper/punica_gpu.py | 314 ++ + vllm/lora/punica_wrapper/punica_hpu.py | 87 + + vllm/lora/punica_wrapper/punica_selector.py | 26 + + vllm/lora/punica_wrapper/utils.py | 159 + + vllm/lora/request.py | 85 +- + vllm/lora/utils.py | 145 +- + vllm/lora/worker_manager.py | 247 +- + vllm/model_executor/__init__.py | 8 +- + vllm/model_executor/custom_op.py | 144 + + .../guided_decoding/__init__.py | 150 +- + .../guided_decoding/guided_fields.py | 39 + + .../lm_format_enforcer_decoding.py | 46 +- + .../guided_decoding/outlines_decoding.py | 94 +- + .../outlines_logits_processors.py | 107 +- + vllm/model_executor/guided_decoding/utils.py | 228 ++ + .../guided_decoding/xgrammar_decoding.py | 316 ++ + vllm/model_executor/layers/activation.py | 241 +- + .../layers/fused_moe/__init__.py | 47 +- + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 146 + + ...336,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + + 
...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 218 ++ + ...792,device_name=NVIDIA_A100-SXM4-80GB.json | 218 ++ + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 218 ++ + ...VIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 218 ++ + ...072,device_name=NVIDIA_H100_80GB_HBM3.json | 218 ++ + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 218 ++ + ...584,device_name=NVIDIA_A100-SXM4-80GB.json | 218 ++ + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 218 ++ + ...168,device_name=NVIDIA_A100-SXM4-80GB.json | 218 ++ + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 146 + + ...336,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 218 ++ + ...792,device_name=NVIDIA_A100-SXM4-80GB.json | 218 ++ + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 146 + + ...VIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 146 + + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 130 + + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 146 + + ...584,device_name=NVIDIA_A100-SXM4-80GB.json | 218 ++ + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 130 + + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 146 + + ...168,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + + ...VIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 146 + + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 130 + + ...280,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + + ...280,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + + ...640,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + + ...640,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + + ...14336,device_name=AMD_Instinct_MI300X.json | 200 + + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 138 + + ...=1792,device_name=AMD_Instinct_MI300X.json | 200 + + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + + ...=3584,device_name=AMD_Instinct_MI300X.json | 200 + + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + + .../E=8,N=3584,device_name=NVIDIA_L40S.json | 173 + + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + + ...=7168,device_name=AMD_Instinct_MI300X.json | 200 + + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + + .../layers/fused_moe/fused_marlin_moe.py | 359 ++ + .../layers/fused_moe/fused_moe.py | 772 +++- + vllm/model_executor/layers/fused_moe/layer.py | 619 +++ + .../layers/fused_moe/moe_pallas.py | 62 + + .../layers/fused_moe/moe_torch_iterative.py | 51 + + vllm/model_executor/layers/layernorm.py | 155 +- + vllm/model_executor/layers/linear.py | 756 +++- + .../model_executor/layers/logits_processor.py | 97 +- + vllm/model_executor/layers/mamba/__init__.py | 0 + .../layers/mamba/mamba_mixer.py | 241 ++ + .../layers/mamba/ops/__init__.py | 0 + .../layers/mamba/ops/causal_conv1d.py | 102 + + .../layers/mamba/ops/mamba_ssm.py | 411 ++ + vllm/model_executor/layers/pooler.py | 320 ++ + .../layers/quantization/__init__.py | 96 +- + .../layers/quantization/aqlm.py | 21 +- + .../model_executor/layers/quantization/awq.py | 100 +- + .../layers/quantization/awq_marlin.py | 475 +++ + .../layers/quantization/awq_triton.py | 317 ++ + .../layers/quantization/base_config.py | 62 +- + .../layers/quantization/bitsandbytes.py | 357 ++ + .../compressed_tensors/__init__.py | 0 + .../compressed_tensors/compressed_tensors.py | 555 +++ + .../compressed_tensors_moe.py | 519 +++ + .../compressed_tensors/schemes/__init__.py | 18 + + .../schemes/compressed_tensors_24.py | 208 + + .../schemes/compressed_tensors_scheme.py | 52 + + .../schemes/compressed_tensors_w4a16_24.py | 157 + + .../schemes/compressed_tensors_w8a16_fp8.py | 
117 + + .../schemes/compressed_tensors_w8a8_fp8.py | 146 + + .../schemes/compressed_tensors_w8a8_int8.py | 108 + + .../schemes/compressed_tensors_wNa16.py | 162 + + .../compressed_tensors/triton_scaled_mm.py | 199 + + .../quantization/compressed_tensors/utils.py | 171 + + .../layers/quantization/deepspeedfp.py | 190 + + .../layers/quantization/experts_int8.py | 180 + + .../layers/quantization/fbgemm_fp8.py | 165 + + .../model_executor/layers/quantization/fp8.py | 653 +++- + .../layers/quantization/gguf.py | 226 ++ + .../layers/quantization/gptq.py | 142 +- + .../layers/quantization/gptq_marlin.py | 749 ++-- + .../layers/quantization/gptq_marlin_24.py | 292 ++ + .../layers/quantization/hqq_marlin.py | 325 ++ + .../layers/quantization/ipex_quant.py | 247 ++ + .../layers/quantization/kernels/__init__.py | 0 + .../kernels/mixed_precision/MPLinearKernel.py | 87 + + .../kernels/mixed_precision/__init__.py | 74 + + .../kernels/mixed_precision/exllama.py | 140 + + .../kernels/mixed_precision/machete.py | 120 + + .../kernels/mixed_precision/marlin.py | 133 + + .../kernels/scaled_mm/ScaledMMLinearKernel.py | 64 + + .../kernels/scaled_mm/__init__.py | 84 + + .../quantization/kernels/scaled_mm/cutlass.py | 134 + + .../quantization/kernels/scaled_mm/xla.py | 101 + + .../layers/quantization/kv_cache.py | 78 + + .../layers/quantization/marlin.py | 110 +- + .../layers/quantization/modelopt.py | 163 + + .../layers/quantization/neuron_quant.py | 64 + + .../model_executor/layers/quantization/qqq.py | 270 ++ + .../layers/quantization/tpu_int8.py | 116 + + .../layers/quantization/utils/__init__.py | 3 + + .../layers/quantization/utils/fp8_utils.py | 353 ++ + .../layers/quantization/utils/layer_utils.py | 37 + + .../quantization/utils/machete_utils.py | 30 + + .../layers/quantization/utils/marlin_utils.py | 350 ++ + .../quantization/utils/marlin_utils_fp8.py | 108 + + .../quantization/utils/marlin_utils_test.py | 163 + + .../utils/marlin_utils_test_24.py | 463 +++ + .../utils/marlin_utils_test_qqq.py | 125 + + .../layers/quantization/utils/quant_utils.py | 454 +++ + .../layers/quantization/utils/w8a8_utils.py | 225 ++ + .../layers/rejection_sampler.py | 389 +- + vllm/model_executor/layers/resampler.py | 267 ++ + .../model_executor/layers/rotary_embedding.py | 687 +++- + vllm/model_executor/layers/sampler.py | 893 +++-- + .../layers/spec_decode_base_sampler.py | 254 ++ + .../layers/typical_acceptance_sampler.py | 170 + + vllm/model_executor/layers/utils.py | 57 + + .../layers/vocab_parallel_embedding.py | 415 +- + vllm/model_executor/model_loader/__init__.py | 20 +- + vllm/model_executor/model_loader/loader.py | 1331 ++++++- + vllm/model_executor/model_loader/neuron.py | 155 +- + vllm/model_executor/model_loader/openvino.py | 203 + + .../model_executor/model_loader/tensorizer.py | 242 +- + vllm/model_executor/model_loader/utils.py | 26 +- + .../model_loader/weight_utils.py | 371 +- + vllm/model_executor/models/__init__.py | 133 +- + vllm/model_executor/models/adapters.py | 248 ++ + vllm/model_executor/models/arctic.py | 581 +++ + vllm/model_executor/models/aria.py | 688 ++++ + vllm/model_executor/models/baichuan.py | 200 +- + vllm/model_executor/models/bart.py | 998 +++++ + vllm/model_executor/models/bert.py | 532 +++ + vllm/model_executor/models/blip.py | 333 ++ + vllm/model_executor/models/blip2.py | 739 ++++ + vllm/model_executor/models/bloom.py | 136 +- + vllm/model_executor/models/chameleon.py | 1166 ++++++ + vllm/model_executor/models/chatglm.py | 548 ++- + vllm/model_executor/models/clip.py | 544 +++ + 
vllm/model_executor/models/commandr.py | 234 +- + vllm/model_executor/models/dbrx.py | 271 +- + vllm/model_executor/models/decilm.py | 32 +- + vllm/model_executor/models/deepseek.py | 143 +- + vllm/model_executor/models/deepseek_v2.py | 652 ++++ + vllm/model_executor/models/deepseek_v3.py | 663 ++++ + vllm/model_executor/models/deepseek_vl2.py | 662 ++++ + vllm/model_executor/models/eagle.py | 212 + + vllm/model_executor/models/exaone.py | 614 +++ + vllm/model_executor/models/falcon.py | 201 +- + vllm/model_executor/models/florence2.py | 264 ++ + vllm/model_executor/models/fuyu.py | 406 ++ + vllm/model_executor/models/gemma.py | 214 +- + vllm/model_executor/models/gemma2.py | 471 +++ + vllm/model_executor/models/glm.py | 21 + + .../models/glm4_vision_encoder.py | 296 ++ + vllm/model_executor/models/gpt2.py | 164 +- + vllm/model_executor/models/gpt_bigcode.py | 175 +- + vllm/model_executor/models/gpt_j.py | 136 +- + vllm/model_executor/models/gpt_neox.py | 131 +- + vllm/model_executor/models/granite.py | 553 +++ + vllm/model_executor/models/granitemoe.py | 458 +++ + vllm/model_executor/models/gritlm.py | 248 ++ + vllm/model_executor/models/h2ovl.py | 400 ++ + .../models/idefics2_vision_model.py | 344 ++ + vllm/model_executor/models/idefics3.py | 777 ++++ + vllm/model_executor/models/interfaces.py | 441 +++ + vllm/model_executor/models/interfaces_base.py | 177 + + vllm/model_executor/models/intern_vit.py | 474 +++ + vllm/model_executor/models/internlm2.py | 316 +- + vllm/model_executor/models/internlm2_ve.py | 154 + + vllm/model_executor/models/internvl.py | 777 ++++ + vllm/model_executor/models/jais.py | 162 +- + vllm/model_executor/models/jamba.py | 631 +++ + vllm/model_executor/models/llama.py | 523 ++- + vllm/model_executor/models/llava.py | 923 ++++- + vllm/model_executor/models/llava_next.py | 587 +++ + .../model_executor/models/llava_next_video.py | 493 +++ + vllm/model_executor/models/llava_onevision.py | 903 +++++ + vllm/model_executor/models/mamba.py | 302 ++ + vllm/model_executor/models/mamba_cache.py | 158 + + vllm/model_executor/models/medusa.py | 208 + + vllm/model_executor/models/minicpm.py | 381 +- + vllm/model_executor/models/minicpm3.py | 251 ++ + vllm/model_executor/models/minicpmv.py | 1023 +++++ + vllm/model_executor/models/mixtral.py | 466 +-- + vllm/model_executor/models/mixtral_quant.py | 166 +- + vllm/model_executor/models/mllama.py | 1527 ++++++++ + vllm/model_executor/models/mlp_speculator.py | 203 + + vllm/model_executor/models/module_mapping.py | 69 + + vllm/model_executor/models/molmo.py | 1412 +++++++ + vllm/model_executor/models/mpt.py | 126 +- + vllm/model_executor/models/nemotron.py | 531 +++ + vllm/model_executor/models/nvlm_d.py | 88 + + vllm/model_executor/models/olmo.py | 138 +- + vllm/model_executor/models/olmo2.py | 432 +++ + vllm/model_executor/models/olmoe.py | 466 +++ + vllm/model_executor/models/opt.py | 189 +- + vllm/model_executor/models/orion.py | 138 +- + vllm/model_executor/models/paligemma.py | 321 ++ + vllm/model_executor/models/persimmon.py | 368 ++ + vllm/model_executor/models/phi.py | 179 +- + vllm/model_executor/models/phi3.py | 20 + + vllm/model_executor/models/phi3_small.py | 482 +++ + vllm/model_executor/models/phi3v.py | 732 ++++ + vllm/model_executor/models/phimoe.py | 676 ++++ + vllm/model_executor/models/pixtral.py | 1123 ++++++ + vllm/model_executor/models/qwen.py | 899 ++++- + vllm/model_executor/models/qwen2.py | 407 +- + vllm/model_executor/models/qwen2_audio.py | 417 ++ + vllm/model_executor/models/qwen2_moe.py | 306 +- + 
vllm/model_executor/models/qwen2_rm.py | 117 + + vllm/model_executor/models/qwen2_vl.py | 1355 +++++++ + vllm/model_executor/models/registry.py | 514 +++ + vllm/model_executor/models/roberta.py | 256 ++ + vllm/model_executor/models/siglip.py | 655 ++++ + vllm/model_executor/models/solar.py | 573 +++ + vllm/model_executor/models/stablelm.py | 159 +- + vllm/model_executor/models/starcoder2.py | 171 +- + vllm/model_executor/models/telechat2.py | 132 + + vllm/model_executor/models/ultravox.py | 556 +++ + vllm/model_executor/models/utils.py | 642 +++ + vllm/model_executor/models/vision.py | 145 + + vllm/model_executor/models/whisper.py | 735 ++++ + vllm/model_executor/parameter.py | 425 ++ + vllm/model_executor/pooling_metadata.py | 69 + + vllm/model_executor/sampling_metadata.py | 399 +- + vllm/model_executor/utils.py | 31 +- + vllm/multimodal/__init__.py | 31 + + vllm/multimodal/audio.py | 75 + + vllm/multimodal/base.py | 461 +++ + vllm/multimodal/hasher.py | 100 + + vllm/multimodal/image.py | 137 + + vllm/multimodal/inputs.py | 523 +++ + vllm/multimodal/parse.py | 366 ++ + vllm/multimodal/processing.py | 1190 ++++++ + vllm/multimodal/profiling.py | 206 + + vllm/multimodal/registry.py | 423 ++ + vllm/multimodal/utils.py | 479 +++ + vllm/multimodal/video.py | 188 + + vllm/outputs.py | 481 ++- + vllm/platforms/__init__.py | 223 ++ + vllm/platforms/cpu.py | 111 + + vllm/platforms/cuda.py | 365 ++ + vllm/platforms/hpu.py | 64 + + vllm/platforms/interface.py | 272 ++ + vllm/platforms/neuron.py | 46 + + vllm/platforms/openvino.py | 143 + + vllm/platforms/rocm.py | 153 + + vllm/platforms/tpu.py | 81 + + vllm/platforms/xpu.py | 95 + + vllm/plugins/__init__.py | 88 + + vllm/pooling_params.py | 23 + + vllm/profiler/__init__.py | 5 + + vllm/profiler/layerwise_profile.py | 372 ++ + vllm/profiler/utils.py | 145 + + vllm/prompt_adapter/__init__.py | 0 + vllm/prompt_adapter/layers.py | 80 + + vllm/prompt_adapter/models.py | 355 ++ + vllm/prompt_adapter/request.py | 34 + + vllm/prompt_adapter/utils.py | 94 + + vllm/prompt_adapter/worker_manager.py | 176 + + vllm/sampling_params.py | 445 ++- + vllm/scalar_type.py | 330 ++ + vllm/scripts.py | 207 + + vllm/sequence.py | 1310 +++++-- + vllm/spec_decode/batch_expansion.py | 311 +- + vllm/spec_decode/draft_model_runner.py | 323 ++ + vllm/spec_decode/interfaces.py | 21 +- + vllm/spec_decode/medusa_worker.py | 137 + + vllm/spec_decode/metrics.py | 56 +- + vllm/spec_decode/mlp_speculator_worker.py | 91 + + vllm/spec_decode/mqa_scorer.py | 113 + + vllm/spec_decode/multi_step_worker.py | 324 +- + vllm/spec_decode/ngram_worker.py | 159 +- + vllm/spec_decode/proposer_worker_base.py | 56 + + .../spec_decode/smaller_tp_proposer_worker.py | 161 + + vllm/spec_decode/spec_decode_worker.py | 952 ++++- + vllm/spec_decode/target_model_runner.py | 42 + + vllm/spec_decode/top1_proposer.py | 158 +- + vllm/spec_decode/util.py | 178 +- + vllm/tracing.py | 119 + + vllm/transformers_utils/__init__.py | 17 + + vllm/transformers_utils/config.py | 593 ++- + vllm/transformers_utils/configs/__init__.py | 32 +- + vllm/transformers_utils/configs/arctic.py | 204 + + vllm/transformers_utils/configs/aria.py | 47 + + vllm/transformers_utils/configs/chatglm.py | 3 +- + vllm/transformers_utils/configs/cohere2.py | 192 + + .../configs/deepseek_vl2.py | 214 + + vllm/transformers_utils/configs/eagle.py | 49 + + vllm/transformers_utils/configs/exaone.py | 189 + + vllm/transformers_utils/configs/h2ovl.py | 13 + + vllm/transformers_utils/configs/internvl.py | 51 + + 
vllm/transformers_utils/configs/jais.py | 1 - + vllm/transformers_utils/configs/medusa.py | 60 + + vllm/transformers_utils/configs/mllama.py | 28 + + .../configs/mlp_speculator.py | 65 + + vllm/transformers_utils/configs/mpt.py | 7 +- + vllm/transformers_utils/configs/nemotron.py | 202 + + vllm/transformers_utils/configs/nvlm_d.py | 12 + + vllm/transformers_utils/configs/olmo2.py | 166 + + vllm/transformers_utils/configs/solar.py | 244 ++ + vllm/transformers_utils/configs/telechat2.py | 61 + + vllm/transformers_utils/configs/ultravox.py | 99 + + vllm/transformers_utils/detokenizer.py | 202 +- + vllm/transformers_utils/detokenizer_utils.py | 167 + + vllm/transformers_utils/processor.py | 104 + + vllm/transformers_utils/s3_utils.py | 151 + + vllm/transformers_utils/tokenizer.py | 180 +- + .../tokenizer_group/__init__.py | 46 +- + .../tokenizer_group/base_tokenizer_group.py | 39 +- + .../tokenizer_group/ray_tokenizer_group.py | 169 +- + .../tokenizer_group/tokenizer_group.py | 66 +- + .../transformers_utils/tokenizers/__init__.py | 6 +- + vllm/transformers_utils/tokenizers/mistral.py | 366 ++ + vllm/transformers_utils/utils.py | 20 + + vllm/triton_utils/__init__.py | 10 + + vllm/triton_utils/custom_cache_manager.py | 53 + + vllm/triton_utils/importing.py | 15 + + vllm/usage/usage_lib.py | 36 +- + vllm/utils.py | 1980 ++++++++-- + vllm/v1/__init__.py | 0 + vllm/v1/attention/__init__.py | 0 + vllm/v1/attention/backends/__init__.py | 0 + vllm/v1/attention/backends/flash_attn.py | 430 ++ + vllm/v1/core/__init__.py | 0 + vllm/v1/core/encoder_cache_manager.py | 48 + + vllm/v1/core/kv_cache_manager.py | 479 +++ + vllm/v1/core/kv_cache_utils.py | 307 ++ + vllm/v1/core/scheduler.py | 618 +++ + vllm/v1/engine/__init__.py | 79 + + vllm/v1/engine/async_llm.py | 342 ++ + vllm/v1/engine/core.py | 286 ++ + vllm/v1/engine/core_client.py | 268 ++ + vllm/v1/engine/detokenizer.py | 180 + + vllm/v1/engine/llm_engine.py | 179 + + vllm/v1/engine/mm_input_mapper.py | 142 + + vllm/v1/engine/output_processor.py | 200 + + vllm/v1/engine/processor.py | 223 ++ + vllm/v1/executor/__init__.py | 0 + vllm/v1/executor/abstract.py | 57 + + vllm/v1/executor/multiproc_executor.py | 405 ++ + vllm/v1/executor/ray_executor.py | 342 ++ + vllm/v1/executor/ray_utils.py | 280 ++ + vllm/v1/executor/uniproc_executor.py | 84 + + vllm/v1/metrics/__init__.py | 0 + vllm/v1/metrics/loggers.py | 38 + + vllm/v1/metrics/stats.py | 39 + + vllm/v1/outputs.py | 39 + + vllm/v1/request.py | 171 + + vllm/v1/sample/__init__.py | 0 + vllm/v1/sample/metadata.py | 31 + + vllm/v1/sample/ops/__init__.py | 0 + vllm/v1/sample/ops/penalties.py | 59 + + vllm/v1/sample/ops/topk_topp_sampler.py | 201 + + vllm/v1/sample/sampler.py | 136 + + vllm/v1/serial_utils.py | 10 + + vllm/v1/utils.py | 136 + + vllm/v1/worker/__init__.py | 0 + vllm/v1/worker/block_table.py | 78 + + vllm/v1/worker/gpu_input_batch.py | 435 +++ + vllm/v1/worker/gpu_model_runner.py | 866 +++++ + vllm/v1/worker/gpu_worker.py | 273 ++ + vllm/version.py | 11 + + vllm/worker/cache_engine.py | 58 +- + vllm/worker/cpu_enc_dec_model_runner.py | 325 ++ + vllm/worker/cpu_model_runner.py | 903 +++-- + vllm/worker/cpu_pooling_model_runner.py | 134 + + vllm/worker/cpu_worker.py | 247 +- + vllm/worker/enc_dec_model_runner.py | 526 +++ + vllm/worker/hpu_model_runner.py | 2016 ++++++++++ + vllm/worker/hpu_worker.py | 410 ++ + vllm/worker/model_runner.py | 2698 ++++++++----- + vllm/worker/model_runner_base.py | 305 ++ + vllm/worker/multi_step_model_runner.py | 907 +++++ + vllm/worker/multi_step_tpu_worker.py | 
105 + + vllm/worker/multi_step_worker.py | 194 + + vllm/worker/neuron_model_runner.py | 232 +- + vllm/worker/neuron_worker.py | 83 +- + vllm/worker/openvino_model_runner.py | 369 ++ + vllm/worker/openvino_worker.py | 588 +++ + vllm/worker/pooling_model_runner.py | 201 + + vllm/worker/tpu_model_runner.py | 896 +++++ + vllm/worker/tpu_worker.py | 294 ++ + vllm/worker/utils.py | 51 + + vllm/worker/worker.py | 473 ++- + vllm/worker/worker_base.py | 392 +- + vllm/worker/xpu_model_runner.py | 609 +++ + vllm/worker/xpu_worker.py | 184 + + 1537 files changed, 284998 insertions(+), 25880 deletions(-) + create mode 100644 .buildkite/generate_index.py + create mode 100644 .buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml + create mode 100644 .buildkite/lm-eval-harness/configs/models-large.txt + create mode 100644 .buildkite/lm-eval-harness/configs/models-small.txt + create mode 100644 .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh + create mode 100644 .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh + create mode 100644 .buildkite/lm-eval-harness/run-tests.sh + create mode 100644 .buildkite/lm-eval-harness/test_lm_eval_correctness.py + create mode 100644 .buildkite/nightly-benchmarks/README.md + create mode 100644 .buildkite/nightly-benchmarks/benchmark-pipeline.yaml + create mode 100644 .buildkite/nightly-benchmarks/nightly-annotation.md + create mode 100644 .buildkite/nightly-benchmarks/nightly-descriptions.md + create mode 100644 .buildkite/nightly-benchmarks/nightly-pipeline.yaml + create mode 100644 
.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md + create mode 100644 .buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py + create mode 100644 .buildkite/nightly-benchmarks/scripts/download-tokenizer.py + create mode 100644 .buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py + create mode 100644 .buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py + create mode 100644 .buildkite/nightly-benchmarks/scripts/launch-server.sh + create mode 100644 .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh + create mode 100644 .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh + create mode 100644 .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + create mode 100644 .buildkite/nightly-benchmarks/scripts/summary-nightly-results.py + create mode 100644 .buildkite/nightly-benchmarks/scripts/wait-for-image.sh + create mode 100644 .buildkite/nightly-benchmarks/tests/latency-tests.json + create mode 100644 .buildkite/nightly-benchmarks/tests/nightly-tests.json + create mode 100644 .buildkite/nightly-benchmarks/tests/serving-tests.json + create mode 100644 .buildkite/nightly-benchmarks/tests/throughput-tests.json + create mode 100644 .buildkite/release-pipeline.yaml + create mode 100644 .buildkite/run-cpu-test-ppc64le.sh + create mode 100644 .buildkite/run-gh200-test.sh + create mode 100644 .buildkite/run-hpu-test.sh + create mode 100644 .buildkite/run-multi-node-test.sh + create mode 100644 .buildkite/run-openvino-test.sh + create mode 100644 .buildkite/run-tpu-test.sh + create mode 100644 .buildkite/run-xpu-test.sh + create mode 100644 .buildkite/upload-wheels.sh + create mode 100644 .clang-format + create mode 100644 .github/CODEOWNERS + create mode 100644 .github/FUNDING.yml + create mode 100644 .github/ISSUE_TEMPLATE/400-bug-report.yml + create mode 100644 .github/ISSUE_TEMPLATE/500-feature-request.yml + create mode 100644 .github/ISSUE_TEMPLATE/600-new-model.yml + create mode 100644 .github/ISSUE_TEMPLATE/700-performance-discussion.yml + create mode 100644 .github/ISSUE_TEMPLATE/800-misc-discussion.yml + create mode 100644 .github/dependabot.yml + create mode 100644 .github/mergify.yml + create mode 100644 .github/scripts/cleanup_pr_body.sh + create mode 100644 .github/workflows/actionlint.yml + create mode 100644 .github/workflows/add_label_automerge.yml + create mode 100644 .github/workflows/clang-format.yml + create mode 100644 .github/workflows/cleanup_pr_body.yml + create mode 100644 .github/workflows/codespell.yml + create mode 100644 .github/workflows/doc-lint.yml + create mode 100644 .github/workflows/lint-and-deploy.yaml + create mode 100644 .github/workflows/matchers/actionlint.json + create mode 100644 .github/workflows/matchers/mypy.json + create mode 100644 .github/workflows/matchers/ruff.json + create mode 100644 .github/workflows/png-lint.yml + create mode 100644 .github/workflows/reminder_comment.yml + create mode 100644 .github/workflows/shellcheck.yml + create mode 100644 .github/workflows/stale.yml + create mode 100644 .shellcheckrc + create mode 100644 CODE_OF_CONDUCT.md + create mode 100644 DCO + create mode 100644 Dockerfile.arm + create mode 100644 Dockerfile.hpu + create mode 100644 Dockerfile.openvino + create mode 100644 Dockerfile.ppc64le + create mode 100644 Dockerfile.tpu + create mode 100644 Dockerfile.xpu + create mode 100644 SECURITY.md + create mode 100644 benchmarks/benchmark_guided.py + create mode 100644 benchmarks/benchmark_long_document_qa_throughput.py + create 
mode 100644 benchmarks/benchmark_prioritization.py + create mode 100644 benchmarks/benchmark_serving_guided.py + create mode 100644 benchmarks/cutlass_benchmarks/sparse_benchmarks.py + create mode 100644 benchmarks/cutlass_benchmarks/utils.py + create mode 100644 benchmarks/cutlass_benchmarks/w8a8_benchmarks.py + create mode 100644 benchmarks/cutlass_benchmarks/weight_shapes.py + create mode 100644 benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh + create mode 100644 benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh + create mode 100644 benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py + create mode 100644 benchmarks/disagg_benchmarks/round_robin_proxy.py + create mode 100644 benchmarks/disagg_benchmarks/visualize_benchmark_results.py + create mode 100644 benchmarks/fused_kernels/layernorm_rms_benchmarks.py + create mode 100644 benchmarks/kernels/benchmark_layernorm.py + create mode 100644 benchmarks/kernels/benchmark_machete.py + create mode 100644 benchmarks/kernels/benchmark_marlin.py + create mode 100644 benchmarks/kernels/benchmark_moe.py + create mode 100644 benchmarks/kernels/benchmark_quant.py + create mode 100644 benchmarks/kernels/benchmark_rmsnorm.py + create mode 100644 benchmarks/kernels/benchmark_shapes.py + create mode 100644 benchmarks/kernels/graph_machete_bench.py + create mode 100644 benchmarks/kernels/requirements.txt + create mode 100644 benchmarks/kernels/weight_shapes.py + create mode 100644 benchmarks/overheads/benchmark_hashing.py + create mode 100644 benchmarks/structured_schemas/structured_schema_1.json + create mode 100644 csrc/attention/attention_kernels.cuh + create mode 100644 csrc/attention/paged_attention_v1.cu + create mode 100644 csrc/attention/paged_attention_v2.cu + create mode 100644 csrc/core/exception.hpp + create mode 100644 csrc/core/math.hpp + create mode 100644 csrc/core/registration.h + create mode 100644 csrc/core/scalar_type.hpp + create mode 100644 csrc/cpu/cpu_types_arm.hpp + create mode 100644 csrc/cpu/cpu_types_vsx.hpp + create mode 100644 csrc/cpu/cpu_types_x86.hpp + create mode 100644 csrc/cpu/dnnl_helper.hpp + create mode 100644 csrc/cpu/quant.cpp + create mode 100644 csrc/cpu/torch_bindings.cpp + create mode 100644 csrc/cpu/utils.cpp + create mode 100644 csrc/cutlass_extensions/common.cpp + create mode 100644 csrc/cutlass_extensions/common.hpp + create mode 100644 csrc/cutlass_extensions/cute_utils.cuh + create mode 100644 csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp + create mode 100644 csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp + create mode 100644 csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp + create mode 100644 csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp + create mode 100644 csrc/cutlass_extensions/torch_utils.hpp + create mode 100644 csrc/cutlass_extensions/vllm_collective_builder.cuh + create mode 100644 csrc/cutlass_extensions/vllm_custom_types.cuh + create mode 100644 csrc/cutlass_extensions/vllm_cutlass_library_extension.py + create mode 100644 csrc/cutlass_extensions/vllm_numeric_conversion.cuh + create mode 100644 csrc/cutlass_extensions/vllm_type_utils.cuh + create mode 100644 csrc/layernorm_quant_kernels.cu + create mode 100644 csrc/mamba/causal_conv1d/causal_conv1d.cu + create mode 100644 csrc/mamba/causal_conv1d/causal_conv1d.h + create mode 100644 csrc/mamba/causal_conv1d/static_switch.h + create mode 100644 csrc/mamba/mamba_ssm/selective_scan.h + create mode 100644 csrc/mamba/mamba_ssm/selective_scan_fwd.cu + create 
mode 100644 csrc/mamba/mamba_ssm/static_switch.h + create mode 100644 csrc/moe/marlin_kernels/marlin_moe_kernel.h + create mode 100644 csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu + create mode 100644 csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h + create mode 100644 csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu + create mode 100644 csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h + create mode 100644 csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu + create mode 100644 csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h + create mode 100644 csrc/moe/marlin_moe_ops.cu + create mode 100644 csrc/moe/moe_align_sum_kernels.cu + create mode 100644 csrc/moe/torch_bindings.cpp + create mode 100644 csrc/permute_cols.cu + create mode 100644 csrc/prepare_inputs/advance_step.cu + create mode 100644 csrc/prepare_inputs/advance_step.cuh + create mode 100644 csrc/quantization/compressed_tensors/int8_quant_kernels.cu + create mode 100644 csrc/quantization/cutlass_w8a8/Epilogues.md + create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu + create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh + create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh + create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh + create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh + create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh + create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu + create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh + create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh + create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_int8_dispatch.cuh + create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu + create mode 100644 csrc/quantization/fp8/amd/hip_float8.h + create mode 100644 csrc/quantization/fp8/amd/hip_float8_impl.h + create mode 100644 csrc/quantization/fp8/amd/quant_utils.cuh + create mode 100644 csrc/quantization/fp8/common.cu + create mode 100644 csrc/quantization/fp8/common.cuh + create mode 100644 csrc/quantization/fp8/fp8_marlin.cu + create mode 100644 csrc/quantization/fp8/nvidia/quant_utils.cuh + create mode 100644 csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu + create mode 100644 csrc/quantization/fused_kernels/layernorm_utils.cuh + create mode 100644 csrc/quantization/fused_kernels/quant_conversions.cuh + create mode 100644 csrc/quantization/gguf/dequantize.cuh + create mode 100644 csrc/quantization/gguf/ggml-common.h + create mode 100644 csrc/quantization/gguf/gguf_kernel.cu + create mode 100644 csrc/quantization/gguf/mmq.cuh + create mode 100644 csrc/quantization/gguf/mmvq.cuh + create mode 100644 csrc/quantization/gguf/vecdotq.cuh + create mode 100644 csrc/quantization/gptq_marlin/awq_marlin_repack.cu + create mode 100644 csrc/quantization/gptq_marlin/marlin.cuh + create mode 100644 csrc/quantization/gptq_marlin/marlin_dtypes.cuh + create mode 100644 csrc/quantization/machete/Readme.md + create mode 100644 csrc/quantization/machete/generate.py + create mode 100644 csrc/quantization/machete/machete_collective_builder.cuh + create mode 100644 csrc/quantization/machete/machete_interleaving_utils.cuh + create mode 100644 csrc/quantization/machete/machete_mainloop.cuh + create mode 100644 csrc/quantization/machete/machete_mm_kernel.cuh + create mode 100644 csrc/quantization/machete/machete_mm_launcher.cuh + 
create mode 100644 csrc/quantization/machete/machete_prepack_kernel.cuh + create mode 100644 csrc/quantization/machete/machete_prepack_launcher.cuh + create mode 100644 csrc/quantization/machete/machete_prepacked_layout.cuh + create mode 100644 csrc/quantization/machete/machete_pytorch.cu + create mode 100644 csrc/quantization/marlin/dense/LICENSE + create mode 100644 csrc/quantization/marlin/dense/common/base.h + create mode 100644 csrc/quantization/marlin/dense/common/mem.h + create mode 100644 csrc/quantization/marlin/dense/marlin_cuda_kernel.cu + create mode 100644 csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu + create mode 100644 csrc/quantization/marlin/sparse/LICENSE + create mode 100644 csrc/quantization/marlin/sparse/common/base.h + create mode 100644 csrc/quantization/marlin/sparse/common/mem.h + create mode 100644 csrc/quantization/marlin/sparse/common/mma.h + create mode 100644 csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu + create mode 100644 csrc/quantization/vectorization.cuh + create mode 100644 csrc/rocm/attention.cu + create mode 100644 csrc/rocm/ops.h + create mode 100644 csrc/rocm/torch_bindings.cpp + create mode 100644 csrc/sparse/cutlass/sparse_compressor_c3x.cu + create mode 100644 csrc/sparse/cutlass/sparse_compressor_entry.cu + create mode 100644 csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu + create mode 100644 csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh + create mode 100644 csrc/sparse/cutlass/sparse_scaled_mm_entry.cu + create mode 100644 csrc/torch_bindings.cpp + create mode 100644 csrc/type_convert.cuh + create mode 100644 docs/source/_static/custom.js + create mode 100644 docs/source/_templates/sections/header.html + create mode 100644 docs/source/api/engine/async_llm_engine.md + create mode 100644 docs/source/api/engine/index.md + create mode 100644 docs/source/api/engine/llm_engine.md + create mode 100644 docs/source/api/inference_params.md + create mode 100644 docs/source/api/model/adapters.md + create mode 100644 docs/source/api/model/index.md + create mode 100644 docs/source/api/model/interfaces.md + create mode 100644 docs/source/api/model/interfaces_base.md + create mode 100644 docs/source/api/multimodal/index.md + create mode 100644 docs/source/api/multimodal/inputs.md + create mode 100644 docs/source/api/multimodal/parse.md + create mode 100644 docs/source/api/multimodal/processing.md + create mode 100644 docs/source/api/multimodal/profiling.md + create mode 100644 docs/source/api/multimodal/registry.md + create mode 100644 docs/source/api/offline_inference/index.md + create mode 100644 docs/source/api/offline_inference/llm.md + create mode 100644 docs/source/api/offline_inference/llm_inputs.md + create mode 100644 docs/source/assets/contributing/dockerfile-stages-dependency.png + create mode 100644 docs/source/assets/deployment/architecture_helm_deployment.png + create mode 100644 docs/source/assets/design/arch_overview/entrypoints.excalidraw.png + create mode 100644 docs/source/assets/design/arch_overview/llm_engine.excalidraw.png + create mode 100644 docs/source/assets/design/hierarchy.png + create mode 100644 docs/source/assets/features/disagg_prefill/abstraction.jpg + create mode 100644 docs/source/assets/features/disagg_prefill/overview.jpg + create mode 100644 docs/source/community/meetups.md + create mode 100644 docs/source/community/sponsors.md + create mode 100644 docs/source/contributing/dockerfile/dockerfile.md + create mode 100644 docs/source/contributing/model/basic.md + create mode 100644 
docs/source/contributing/model/index.md + create mode 100644 docs/source/contributing/model/multimodal.md + create mode 100644 docs/source/contributing/model/registration.md + create mode 100644 docs/source/contributing/model/tests.md + create mode 100644 docs/source/contributing/overview.md + create mode 100644 docs/source/contributing/profiling/profiling_index.md + create mode 100644 docs/source/contributing/vulnerability_management.md + create mode 100644 docs/source/deployment/docker.md + create mode 100644 docs/source/deployment/frameworks/bentoml.md + create mode 100644 docs/source/deployment/frameworks/cerebrium.md + create mode 100644 docs/source/deployment/frameworks/dstack.md + create mode 100644 docs/source/deployment/frameworks/helm.md + create mode 100644 docs/source/deployment/frameworks/index.md + create mode 100644 docs/source/deployment/frameworks/lws.md + create mode 100644 docs/source/deployment/frameworks/modal.md + create mode 100644 docs/source/deployment/frameworks/skypilot.md + create mode 100644 docs/source/deployment/frameworks/triton.md + create mode 100644 docs/source/deployment/integrations/index.md + create mode 100644 docs/source/deployment/integrations/kserve.md + create mode 100644 docs/source/deployment/integrations/kubeai.md + create mode 100644 docs/source/deployment/integrations/llamastack.md + create mode 100644 docs/source/deployment/k8s.md + create mode 100644 docs/source/deployment/nginx.md + create mode 100644 docs/source/design/arch_overview.md + create mode 100644 docs/source/design/automatic_prefix_caching.md + create mode 100644 docs/source/design/huggingface_integration.md + create mode 100644 docs/source/design/kernel/paged_attention.md + create mode 100644 docs/source/design/mm_processing.md + create mode 100644 docs/source/design/multiprocessing.md + create mode 100644 docs/source/design/plugin_system.md + create mode 100644 docs/source/features/automatic_prefix_caching.md + create mode 100644 docs/source/features/compatibility_matrix.md + create mode 100644 docs/source/features/disagg_prefill.md + create mode 100644 docs/source/features/lora.md + create mode 100644 docs/source/features/quantization/auto_awq.md + create mode 100644 docs/source/features/quantization/bnb.md + create mode 100644 docs/source/features/quantization/fp8.md + create mode 100644 docs/source/features/quantization/fp8_e4m3_kvcache.md + create mode 100644 docs/source/features/quantization/fp8_e5m2_kvcache.md + create mode 100644 docs/source/features/quantization/gguf.md + create mode 100644 docs/source/features/quantization/index.md + create mode 100644 docs/source/features/quantization/int8.md + create mode 100644 docs/source/features/quantization/supported_hardware.md + create mode 100644 docs/source/features/spec_decode.md + create mode 100644 docs/source/features/structured_outputs.md + create mode 100644 docs/source/features/tool_calling.md + create mode 100644 docs/source/getting_started/faq.md + create mode 100644 docs/source/getting_started/installation/cpu-apple.md + create mode 100644 docs/source/getting_started/installation/cpu-arm.md + create mode 100644 docs/source/getting_started/installation/cpu-x86.md + create mode 100644 docs/source/getting_started/installation/gpu-cuda.md + create mode 100644 docs/source/getting_started/installation/gpu-rocm.md + create mode 100644 docs/source/getting_started/installation/hpu-gaudi.md + create mode 100644 docs/source/getting_started/installation/index.md + create mode 100644 
docs/source/getting_started/installation/neuron.md + create mode 100644 docs/source/getting_started/installation/openvino.md + create mode 100644 docs/source/getting_started/installation/tpu.md + create mode 100644 docs/source/getting_started/installation/xpu.md + create mode 100644 docs/source/getting_started/quickstart.md + create mode 100644 docs/source/getting_started/troubleshooting.md + create mode 100644 docs/source/index.md + create mode 100644 docs/source/models/extensions/index.md + create mode 100644 docs/source/models/extensions/runai_model_streamer.md + create mode 100644 docs/source/models/extensions/tensorizer.md + create mode 100644 docs/source/models/generative_models.md + create mode 100644 docs/source/models/pooling_models.md + create mode 100644 docs/source/models/supported_models.md + create mode 100644 docs/source/performance/benchmarks.md + create mode 100644 docs/source/performance/optimization.md + create mode 100644 docs/source/serving/distributed_serving.md + create mode 100644 docs/source/serving/engine_args.md + create mode 100644 docs/source/serving/env_vars.md + create mode 100644 docs/source/serving/integrations/index.md + create mode 100644 docs/source/serving/integrations/langchain.md + create mode 100644 docs/source/serving/integrations/llamaindex.md + create mode 100644 docs/source/serving/metrics.md + create mode 100644 docs/source/serving/multimodal_inputs.md + create mode 100644 docs/source/serving/offline_inference.md + create mode 100644 examples/offline_inference/aqlm_example.py + create mode 100644 examples/offline_inference/arctic.py + create mode 100644 examples/offline_inference/audio_language.py + create mode 100644 examples/offline_inference/basic.py + create mode 100644 examples/offline_inference/basic_with_model_default_sampling.py + create mode 100644 examples/offline_inference/chat.py + create mode 100644 examples/offline_inference/chat_with_tools.py + create mode 100644 examples/offline_inference/classification.py + create mode 100644 examples/offline_inference/cli.py + create mode 100644 examples/offline_inference/cpu_offload.py + create mode 100644 examples/offline_inference/distributed.py + create mode 100644 examples/offline_inference/embedding.py + create mode 100644 examples/offline_inference/encoder_decoder.py + create mode 100644 examples/offline_inference/florence2_inference.py + create mode 100644 examples/offline_inference/gguf_inference.py + create mode 100644 examples/offline_inference/llm_engine_example.py + create mode 100644 examples/offline_inference/lora_with_quantization_inference.py + create mode 100644 examples/offline_inference/mlpspeculator.py + create mode 100644 examples/offline_inference/multilora_inference.py + create mode 100644 examples/offline_inference/neuron.py + create mode 100644 examples/offline_inference/neuron_int8_quantization.py + create mode 100644 examples/offline_inference/openai/openai_batch.md + create mode 100644 examples/offline_inference/openai/openai_example_batch.jsonl + create mode 100644 examples/offline_inference/pixtral.py + create mode 100644 examples/offline_inference/prefix_caching.py + create mode 100644 examples/offline_inference/profiling.py + create mode 100644 examples/offline_inference/save_sharded_state.py + create mode 100644 examples/offline_inference/scoring.py + create mode 100644 examples/offline_inference/simple_profiling.py + create mode 100644 examples/offline_inference/structured_outputs.py + create mode 100644 examples/offline_inference/tpu.py + create mode 100644 
examples/offline_inference/vision_language.py + create mode 100644 examples/offline_inference/vision_language_embedding.py + create mode 100644 examples/offline_inference/vision_language_multi_image.py + create mode 100644 examples/offline_inference/whisper.py + create mode 100644 examples/online_serving/api_client.py + create mode 100644 examples/online_serving/chart-helm/.helmignore + create mode 100644 examples/online_serving/chart-helm/Chart.yaml + create mode 100644 examples/online_serving/chart-helm/README.md + create mode 100644 examples/online_serving/chart-helm/ct.yaml + create mode 100644 examples/online_serving/chart-helm/lintconf.yaml + create mode 100644 examples/online_serving/chart-helm/templates/_helpers.tpl + create mode 100644 examples/online_serving/chart-helm/templates/configmap.yaml + create mode 100644 examples/online_serving/chart-helm/templates/custom-objects.yaml + create mode 100644 examples/online_serving/chart-helm/templates/deployment.yaml + create mode 100644 examples/online_serving/chart-helm/templates/hpa.yaml + create mode 100644 examples/online_serving/chart-helm/templates/job.yaml + create mode 100644 examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml + create mode 100644 examples/online_serving/chart-helm/templates/pvc.yaml + create mode 100644 examples/online_serving/chart-helm/templates/secrets.yaml + create mode 100644 examples/online_serving/chart-helm/templates/service.yaml + create mode 100644 examples/online_serving/chart-helm/values.schema.json + create mode 100644 examples/online_serving/chart-helm/values.yaml + create mode 100644 examples/online_serving/disaggregated_prefill.sh + create mode 100644 examples/online_serving/gradio_openai_chatbot_webserver.py + create mode 100644 examples/online_serving/gradio_webserver.py + create mode 100644 examples/online_serving/openai_chat_completion_client.py + create mode 100644 examples/online_serving/openai_chat_completion_client_for_multimodal.py + create mode 100644 examples/online_serving/openai_chat_completion_client_with_tools.py + create mode 100644 examples/online_serving/openai_chat_completion_structured_outputs.py + create mode 100644 examples/online_serving/openai_chat_embedding_client_for_multimodal.py + create mode 100644 examples/online_serving/openai_completion_client.py + create mode 100644 examples/online_serving/openai_cross_encoder_score.py + create mode 100644 examples/online_serving/openai_embedding_client.py + create mode 100644 examples/online_serving/openai_pooling_client.py + create mode 100644 examples/online_serving/opentelemetry/Otel.md + create mode 100644 examples/online_serving/opentelemetry/dummy_client.py + create mode 100644 examples/online_serving/prometheus_grafana/README.md + create mode 100644 examples/online_serving/prometheus_grafana/docker-compose.yaml + create mode 100644 examples/online_serving/prometheus_grafana/grafana.json + create mode 100644 examples/online_serving/prometheus_grafana/prometheus.yaml + create mode 100644 examples/online_serving/run_cluster.sh + create mode 100644 examples/online_serving/sagemaker-entrypoint.sh + create mode 100644 examples/other/fp8/README.md + create mode 100644 examples/other/fp8/extract_scales.py + create mode 100644 examples/other/fp8/quantizer/README.md + create mode 100644 examples/other/fp8/quantizer/quantize.py + create mode 100644 examples/other/logging_configuration.md + create mode 100644 examples/other/tensorize_vllm_model.py + create mode 100644 examples/template_blip2.jinja + create mode 
100644 examples/template_dse_qwen2_vl.jinja + create mode 100644 examples/template_llava.jinja + create mode 100644 examples/template_pixtral_hf.jinja + create mode 100644 examples/template_vlm2vec.jinja + create mode 100644 examples/tool_chat_template_granite.jinja + create mode 100644 examples/tool_chat_template_granite_20b_fc.jinja + create mode 100644 examples/tool_chat_template_hermes.jinja + create mode 100644 examples/tool_chat_template_internlm2_tool.jinja + create mode 100644 examples/tool_chat_template_llama3.1_json.jinja + create mode 100644 examples/tool_chat_template_llama3.2_json.jinja + create mode 100644 examples/tool_chat_template_llama3.2_pythonic.jinja + create mode 100644 examples/tool_chat_template_mistral.jinja + create mode 100644 examples/tool_chat_template_mistral_parallel.jinja + create mode 100644 examples/tool_chat_template_toolace.jinja + create mode 100644 find_cuda_init.py + create mode 100644 python_only_dev.py + create mode 100644 requirements-hpu.txt + create mode 100644 requirements-lint.txt + create mode 100644 requirements-openvino.txt + create mode 100644 requirements-test.in + create mode 100644 requirements-test.txt + create mode 100644 requirements-tpu.txt + create mode 100644 requirements-xpu.txt + create mode 100644 tests/async_engine/__init__.py + create mode 100644 tests/basic_correctness/__init__.py + create mode 100644 tests/basic_correctness/test_cpu_offload.py + create mode 100644 tests/compile/__init__.py + create mode 100644 tests/compile/backend.py + create mode 100644 tests/compile/piecewise/__init__.py + create mode 100644 tests/compile/piecewise/test_simple.py + create mode 100644 tests/compile/piecewise/test_toy_llama.py + create mode 100644 tests/compile/test_basic_correctness.py + create mode 100644 tests/compile/test_full_graph.py + create mode 100644 tests/compile/test_functionalization.py + create mode 100644 tests/compile/test_fusion.py + create mode 100644 tests/compile/test_pass_manager.py + create mode 100644 tests/compile/test_wrapper.py + create mode 100644 tests/compile/utils.py + create mode 100644 tests/core/block/e2e/__init__.py + create mode 100644 tests/core/block/e2e/test_correctness_sliding_window.py + create mode 100644 tests/core/block/test_block_manager.py + create mode 100644 tests/core/test_num_computed_tokens_update.py + create mode 100644 tests/core/test_scheduler_encoder_decoder.py + create mode 100644 tests/core/test_serialization.py + create mode 100644 tests/data/test_config.yaml + create mode 100644 tests/distributed/__init__.py + create mode 100644 tests/distributed/test_ca_buffer_sharing.py + create mode 100644 tests/distributed/test_distributed_oot.py + create mode 100644 tests/distributed/test_multi_node_assignment.py + create mode 100644 tests/distributed/test_pipeline_parallel.py + create mode 100644 tests/distributed/test_pipeline_partition.py + create mode 100644 tests/distributed/test_pp_cudagraph.py + create mode 100644 tests/distributed/test_same_node.py + create mode 100644 tests/distributed/test_shm_broadcast.py + create mode 100644 tests/distributed/test_utils.py + create mode 100644 tests/encoder_decoder/__init__.py + create mode 100644 tests/encoder_decoder/test_e2e_correctness.py + create mode 100644 tests/engine/__init__.py + create mode 100644 tests/engine/output_processor/__init__.py + create mode 100644 tests/engine/output_processor/test_stop_checker.py + create mode 100644 tests/engine/test_arg_utils.py + create mode 100644 tests/engine/test_custom_executor.py + create mode 100644 
tests/engine/test_short_mm_context.py + create mode 100644 tests/entrypoints/__init__.py + create mode 100644 tests/entrypoints/conftest.py + create mode 100644 tests/entrypoints/llm/__init__.py + create mode 100644 tests/entrypoints/llm/test_accuracy.py + create mode 100644 tests/entrypoints/llm/test_chat.py + create mode 100644 tests/entrypoints/llm/test_encode.py + create mode 100644 tests/entrypoints/llm/test_generate.py + create mode 100644 tests/entrypoints/llm/test_generate_multiple_loras.py + create mode 100644 tests/entrypoints/llm/test_gpu_utilization.py + create mode 100644 tests/entrypoints/llm/test_guided_generate.py + create mode 100644 tests/entrypoints/llm/test_init.py + create mode 100644 tests/entrypoints/llm/test_lazy_outlines.py + create mode 100644 tests/entrypoints/llm/test_prompt_validation.py + create mode 100644 tests/entrypoints/offline_mode/__init__.py + create mode 100644 tests/entrypoints/offline_mode/test_offline_mode.py + create mode 100644 tests/entrypoints/openai/__init__.py + create mode 100644 tests/entrypoints/openai/test_accuracy.py + create mode 100644 tests/entrypoints/openai/test_async_tokenization.py + create mode 100644 tests/entrypoints/openai/test_audio.py + create mode 100644 tests/entrypoints/openai/test_basic.py + create mode 100644 tests/entrypoints/openai/test_chat.py + create mode 100644 tests/entrypoints/openai/test_chat_echo.py + create mode 100644 tests/entrypoints/openai/test_chat_template.py + create mode 100644 tests/entrypoints/openai/test_chunked_prompt.py + create mode 100644 tests/entrypoints/openai/test_cli_args.py + create mode 100644 tests/entrypoints/openai/test_completion.py + create mode 100644 tests/entrypoints/openai/test_embedding.py + create mode 100644 tests/entrypoints/openai/test_encoder_decoder.py + create mode 100644 tests/entrypoints/openai/test_lora_adapters.py + create mode 100644 tests/entrypoints/openai/test_metrics.py + create mode 100644 tests/entrypoints/openai/test_models.py + create mode 100644 tests/entrypoints/openai/test_oot_registration.py + create mode 100644 tests/entrypoints/openai/test_pooling.py + create mode 100644 tests/entrypoints/openai/test_prompt_validation.py + create mode 100644 tests/entrypoints/openai/test_return_tokens_as_ids.py + create mode 100644 tests/entrypoints/openai/test_root_path.py + create mode 100644 tests/entrypoints/openai/test_run_batch.py + create mode 100644 tests/entrypoints/openai/test_score.py + create mode 100644 tests/entrypoints/openai/test_serving_models.py + create mode 100644 tests/entrypoints/openai/test_shutdown.py + create mode 100644 tests/entrypoints/openai/test_tokenization.py + create mode 100644 tests/entrypoints/openai/test_video.py + create mode 100644 tests/entrypoints/openai/test_vision.py + create mode 100644 tests/entrypoints/openai/test_vision_embedding.py + create mode 100644 tests/entrypoints/openai/tool_parsers/__init__.py + create mode 100644 tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py + create mode 100644 tests/entrypoints/openai/tool_parsers/utils.py + create mode 100644 tests/entrypoints/test_chat_utils.py + create mode 100644 tests/kernels/__init__.py + create mode 100644 tests/kernels/quant_utils.py + create mode 100644 tests/kernels/test_aqlm.py + create mode 100644 tests/kernels/test_attention_selector.py + create mode 100644 tests/kernels/test_awq.py + create mode 100644 tests/kernels/test_awq_marlin.py + create mode 100644 tests/kernels/test_awq_triton.py + create mode 100644 tests/kernels/test_block_fp8.py + 
create mode 100644 tests/kernels/test_blocksparse_attention.py + create mode 100644 tests/kernels/test_cascade_flash_attn.py + create mode 100644 tests/kernels/test_causal_conv1d.py + create mode 100644 tests/kernels/test_cutlass.py + create mode 100644 tests/kernels/test_encoder_decoder_attn.py + create mode 100644 tests/kernels/test_flash_attn.py + create mode 100644 tests/kernels/test_flashinfer.py + create mode 100644 tests/kernels/test_fp8_quant.py + create mode 100644 tests/kernels/test_fused_quant_layernorm.py + create mode 100644 tests/kernels/test_ggml.py + create mode 100644 tests/kernels/test_gguf.py + create mode 100644 tests/kernels/test_gptq.py + create mode 100644 tests/kernels/test_int8_quant.py + create mode 100644 tests/kernels/test_machete_mm.py + create mode 100644 tests/kernels/test_mamba_ssm.py + create mode 100644 tests/kernels/test_marlin_gemm.py + create mode 100644 tests/kernels/test_permute_cols.py + create mode 100644 tests/kernels/test_rotary_embedding.py + create mode 100644 tests/kernels/test_semi_structured.py + create mode 100644 tests/kernels/test_triton_scaled_mm.py + create mode 100644 tests/kernels/test_utils.py + create mode 100644 tests/kernels/utils.py + create mode 100644 tests/kv_transfer/disagg_test.py + create mode 100644 tests/kv_transfer/module_test.py + create mode 100644 tests/kv_transfer/test_lookup_buffer.py + create mode 100644 tests/kv_transfer/test_lookup_buffer.sh + create mode 100644 tests/kv_transfer/test_send_recv.py + create mode 100644 tests/kv_transfer/test_send_recv.sh + create mode 100644 tests/lora/data/__init__.py + create mode 100644 tests/lora/data/long_context_test_data.py + create mode 100644 tests/lora/test_chatglm3_tp.py + create mode 100644 tests/lora/test_jamba.py + create mode 100644 tests/lora/test_llama_tp.py + create mode 100644 tests/lora/test_long_context.py + create mode 100644 tests/lora/test_lora_bias_e2e.py + create mode 100644 tests/lora/test_lora_huggingface.py + create mode 100644 tests/lora/test_minicpmv_tp.py + create mode 100644 tests/lora/test_phi.py + create mode 100644 tests/lora/test_punica_ops_sizes.py + create mode 100644 tests/lora/test_punica_ops_variation.py + create mode 100644 tests/lora/test_qwen2vl.py + create mode 100644 tests/metrics/__init__.py + create mode 100644 tests/model_executor/__init__.py + create mode 100644 tests/model_executor/conftest.py + create mode 100644 tests/model_executor/test_enabled_custom_ops.py + create mode 100644 tests/model_executor/test_guided_processors.py + create mode 100644 tests/model_executor/test_model_load_with_params.py + create mode 100644 tests/models/__init__.py + create mode 100644 tests/models/decoder_only/__init__.py + create mode 100644 tests/models/decoder_only/audio_language/__init__.py + create mode 100644 tests/models/decoder_only/audio_language/test_ultravox.py + create mode 100644 tests/models/decoder_only/language/__init__.py + create mode 100644 tests/models/decoder_only/language/test_aqlm.py + create mode 100644 tests/models/decoder_only/language/test_fp8.py + create mode 100644 tests/models/decoder_only/language/test_gguf.py + create mode 100644 tests/models/decoder_only/language/test_gptq_marlin.py + create mode 100644 tests/models/decoder_only/language/test_gptq_marlin_24.py + create mode 100644 tests/models/decoder_only/language/test_granite.py + create mode 100644 tests/models/decoder_only/language/test_jamba.py + create mode 100644 tests/models/decoder_only/language/test_mamba.py + create mode 100644 
tests/models/decoder_only/language/test_mistral.py + create mode 100644 tests/models/decoder_only/language/test_modelopt.py + create mode 100644 tests/models/decoder_only/language/test_models.py + create mode 100644 tests/models/decoder_only/language/test_phimoe.py + create mode 100644 tests/models/decoder_only/vision_language/__init__.py + create mode 100644 tests/models/decoder_only/vision_language/test_awq.py + create mode 100644 tests/models/decoder_only/vision_language/test_h2ovl.py + create mode 100644 tests/models/decoder_only/vision_language/test_intern_vit.py + create mode 100644 tests/models/decoder_only/vision_language/test_models.py + create mode 100644 tests/models/decoder_only/vision_language/test_phi3v.py + create mode 100644 tests/models/decoder_only/vision_language/test_pixtral.py + create mode 100644 tests/models/decoder_only/vision_language/test_qwen2_vl.py + create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/__init__.py + create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/builders.py + create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py + create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/core.py + create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py + create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/model_utils.py + create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/runners.py + create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/types.py + create mode 100644 tests/models/embedding/__init__.py + create mode 100644 tests/models/embedding/language/__init__.py + create mode 100644 tests/models/embedding/language/test_cls_models.py + create mode 100644 tests/models/embedding/language/test_embedding.py + create mode 100644 tests/models/embedding/language/test_gritlm.py + create mode 100644 tests/models/embedding/language/test_scoring.py + create mode 100644 tests/models/embedding/utils.py + create mode 100644 tests/models/embedding/vision_language/__init__.py + create mode 100644 tests/models/embedding/vision_language/test_dse_qwen2_vl.py + create mode 100644 tests/models/embedding/vision_language/test_llava_next.py + create mode 100644 tests/models/embedding/vision_language/test_phi3v.py + create mode 100644 tests/models/encoder_decoder/__init__.py + create mode 100644 tests/models/encoder_decoder/audio_language/__init__.py + create mode 100644 tests/models/encoder_decoder/audio_language/test_whisper.py + create mode 100644 tests/models/encoder_decoder/language/__init__.py + create mode 100644 tests/models/encoder_decoder/language/test_bart.py + create mode 100644 tests/models/encoder_decoder/vision_language/__init__.py + create mode 100644 tests/models/encoder_decoder/vision_language/test_broadcast.py + create mode 100644 tests/models/encoder_decoder/vision_language/test_florence2.py + create mode 100644 tests/models/encoder_decoder/vision_language/test_mllama.py + create mode 100644 tests/models/fixtures/pixtral_chat.json + create mode 100644 tests/models/fixtures/pixtral_chat_engine.json + create mode 100644 tests/models/multimodal/__init__.py + create mode 100644 tests/models/multimodal/processing/__init__.py + create mode 100644 tests/models/multimodal/processing/test_common.py + create mode 100644 tests/models/multimodal/processing/test_idefics3.py + create mode 100644 tests/models/multimodal/processing/test_internvl.py + create mode 100644 
tests/models/multimodal/processing/test_llava_next.py + create mode 100644 tests/models/multimodal/processing/test_llava_onevision.py + create mode 100644 tests/models/multimodal/processing/test_phi3v.py + create mode 100644 tests/models/multimodal/processing/test_qwen.py + create mode 100644 tests/models/multimodal/processing/test_qwen2_vl.py + create mode 100644 tests/models/registry.py + create mode 100644 tests/models/test_initialization.py + create mode 100644 tests/models/test_registry.py + create mode 100644 tests/mq_llm_engine/__init__.py + create mode 100644 tests/mq_llm_engine/test_abort.py + create mode 100644 tests/mq_llm_engine/test_error_handling.py + create mode 100644 tests/mq_llm_engine/test_load.py + create mode 100644 tests/mq_llm_engine/utils.py + create mode 100644 tests/multi_step/__init__.py + create mode 100644 tests/multi_step/test_correctness_async_llm.py + create mode 100644 tests/multi_step/test_correctness_llm.py + create mode 100644 tests/multimodal/__init__.py + create mode 100644 tests/multimodal/test_inputs.py + create mode 100644 tests/multimodal/test_processing.py + create mode 100644 tests/multimodal/test_processor_kwargs.py + create mode 100644 tests/multimodal/test_utils.py + create mode 100644 tests/multimodal/utils.py + create mode 100644 tests/plugins/vllm_add_dummy_model/setup.py + create mode 100644 tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py + create mode 100644 tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py + create mode 100644 tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py + create mode 100644 tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py + create mode 100644 tests/plugins/vllm_add_dummy_platform/setup.py + create mode 100644 tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py + create mode 100644 tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py + create mode 100644 tests/plugins_tests/test_platform_plugins.py + create mode 100644 tests/prefix_caching/__init__.py + create mode 100644 tests/prefix_caching/test_disable_sliding_window.py + create mode 100644 tests/prompt_adapter/test_bloom.py + create mode 100644 tests/prompt_adapter/test_multi_adapter_inference.py + create mode 100644 tests/prompt_adapter/test_pa_lora.py + create mode 100644 tests/quantization/__init__.py + create mode 100644 tests/quantization/test_bitsandbytes.py + create mode 100644 tests/quantization/test_compressed_tensors.py + create mode 100644 tests/quantization/test_cpu_offload.py + create mode 100644 tests/quantization/test_experts_int8.py + create mode 100644 tests/quantization/test_ipex_quant.py + create mode 100644 tests/quantization/test_lm_head.py + create mode 100644 tests/quantization/utils.py + create mode 100644 tests/runai_model_streamer/__init__.py + create mode 100644 tests/runai_model_streamer/test_runai_model_streamer_loader.py + create mode 100644 tests/runai_model_streamer/test_weight_utils.py + create mode 100644 tests/samplers/__init__.py + create mode 100644 tests/samplers/test_no_bad_words.py + create mode 100644 tests/samplers/test_typical_acceptance_sampler.py + create mode 100644 tests/spec_decode/e2e/test_eagle_correctness.py + create mode 100644 tests/spec_decode/e2e/test_integration.py + create mode 100644 tests/spec_decode/e2e/test_integration_dist_tp2.py + create mode 100644 tests/spec_decode/e2e/test_integration_dist_tp4.py + create mode 100644 tests/spec_decode/e2e/test_medusa_correctness.py + 
create mode 100644 tests/spec_decode/e2e/test_mlp_correctness.py + create mode 100644 tests/spec_decode/e2e/test_seed.py + create mode 100644 tests/spec_decode/test_dynamic_spec_decode.py + create mode 100644 tests/spec_decode/test_scorer.py + create mode 100644 tests/standalone_tests/lazy_torch_compile.py + create mode 100644 tests/standalone_tests/python_only_compile.sh + create mode 100644 tests/system_messages/sonnet3.5_nov2024.txt + create mode 100644 tests/tensorizer_loader/conftest.py + create mode 100644 tests/test_embedded_commit.py + create mode 100644 tests/test_inputs.py + create mode 100644 tests/test_scalartype.py + create mode 100644 tests/test_sharded_state_loader.py + create mode 100644 tests/test_utils.py + create mode 100644 tests/tokenization/test_get_eos.py + create mode 100644 tests/tool_use/__init__.py + create mode 100644 tests/tool_use/conftest.py + create mode 100644 tests/tool_use/test_chat_completion_request_validations.py + create mode 100644 tests/tool_use/test_chat_completions.py + create mode 100644 tests/tool_use/test_jamba_tool_parser.py + create mode 100644 tests/tool_use/test_parallel_tool_calls.py + create mode 100644 tests/tool_use/test_tool_calls.py + create mode 100644 tests/tool_use/utils.py + create mode 100644 tests/tpu/__init__.py + create mode 100644 tests/tpu/test_compilation.py + create mode 100644 tests/tpu/test_custom_dispatcher.py + create mode 100644 tests/tpu/test_quantization_accuracy.py + create mode 100644 tests/tracing/__init__.py + create mode 100644 tests/tracing/test_tracing.py + create mode 100644 tests/utils.py + create mode 100644 tests/v1/__init__.py + create mode 100644 tests/v1/core/test_kv_cache_utils.py + create mode 100644 tests/v1/core/test_prefix_caching.py + create mode 100644 tests/v1/e2e/__init__.py + create mode 100644 tests/v1/e2e/test_cascade_attention.py + create mode 100644 tests/v1/engine/__init__.py + create mode 100644 tests/v1/engine/test_async_llm.py + create mode 100644 tests/v1/engine/test_engine_args.py + create mode 100644 tests/v1/engine/test_engine_core.py + create mode 100644 tests/v1/engine/test_engine_core_client.py + create mode 100644 tests/v1/engine/test_output_processor.py + create mode 100644 tests/v1/sample/__init__.py + create mode 100644 tests/v1/sample/test_sampler.py + create mode 100644 tests/v1/worker/__init__.py + create mode 100644 tests/v1/worker/test_gpu_input_batch.py + create mode 100644 tests/vllm_test_utils/setup.py + create mode 100644 tests/vllm_test_utils/vllm_test_utils/__init__.py + create mode 100644 tests/vllm_test_utils/vllm_test_utils/blame.py + create mode 100644 tests/vllm_test_utils/vllm_test_utils/monitor.py + create mode 100644 tests/weight_loading/models-large.txt + create mode 100644 tests/weight_loading/models.txt + create mode 100644 tests/weight_loading/run_model_weight_loading_test.sh + create mode 100644 tests/weight_loading/test_weight_loading.py + create mode 100644 tests/worker/test_encoder_decoder_model_runner.py + create mode 100644 tests/worker/test_model_input.py + create mode 100644 tests/worker/test_profile.py + create mode 100644 tools/actionlint.sh + create mode 100644 tools/check_repo.sh + create mode 100644 tools/doc-lint.sh + create mode 100644 tools/mypy.sh + create mode 100644 tools/png-lint.sh + create mode 100644 tools/profiler/print_layerwise_table.py + create mode 100644 tools/profiler/visualize_layerwise_profile.py + create mode 100644 tools/report_build_time_ninja.py + create mode 100644 tools/shellcheck.sh + create mode 100644 
use_existing_torch.py + create mode 100644 vllm/_ipex_ops.py + create mode 100644 vllm/adapter_commons/__init__.py + create mode 100644 vllm/adapter_commons/layers.py + create mode 100644 vllm/adapter_commons/models.py + create mode 100644 vllm/adapter_commons/request.py + create mode 100644 vllm/adapter_commons/utils.py + create mode 100644 vllm/adapter_commons/worker_manager.py + create mode 100644 vllm/assets/__init__.py + create mode 100644 vllm/assets/audio.py + create mode 100644 vllm/assets/base.py + create mode 100644 vllm/assets/image.py + create mode 100644 vllm/assets/video.py + create mode 100644 vllm/attention/backends/blocksparse_attn.py + create mode 100644 vllm/attention/backends/hpu_attn.py + create mode 100644 vllm/attention/backends/ipex_attn.py + create mode 100644 vllm/attention/backends/openvino.py + create mode 100644 vllm/attention/backends/pallas.py + create mode 100644 vllm/attention/backends/placeholder_attn.py + create mode 100644 vllm/attention/backends/utils.py + create mode 100644 vllm/attention/ops/blocksparse_attention/__init__.py + create mode 100644 vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py + create mode 100644 vllm/attention/ops/blocksparse_attention/interface.py + create mode 100644 vllm/attention/ops/blocksparse_attention/utils.py + create mode 100644 vllm/attention/ops/hpu_paged_attn.py + create mode 100644 vllm/attention/ops/ipex_attn.py + create mode 100644 vllm/beam_search.py + create mode 100644 vllm/compilation/__init__.py + create mode 100644 vllm/compilation/backends.py + create mode 100644 vllm/compilation/counter.py + create mode 100644 vllm/compilation/decorators.py + create mode 100644 vllm/compilation/fix_functionalization.py + create mode 100644 vllm/compilation/fusion.py + create mode 100644 vllm/compilation/fx_utils.py + create mode 100644 vllm/compilation/inductor_pass.py + create mode 100644 vllm/compilation/monitor.py + create mode 100644 vllm/compilation/multi_output_match.py + create mode 100644 vllm/compilation/pass_manager.py + create mode 100644 vllm/compilation/reshapes.py + create mode 100644 vllm/compilation/vllm_inductor_pass.py + create mode 100644 vllm/compilation/wrapper.py + create mode 100644 vllm/connections.py + create mode 100644 vllm/core/block/utils.py + create mode 100644 vllm/core/block_manager.py + create mode 100644 vllm/core/evictor.py + create mode 100644 vllm/core/placeholder_block_space_manager.py + create mode 100644 vllm/distributed/device_communicators/cuda_wrapper.py + create mode 100644 vllm/distributed/device_communicators/custom_all_reduce_utils.py + create mode 100644 vllm/distributed/device_communicators/hpu_communicator.py + create mode 100644 vllm/distributed/device_communicators/pynccl_wrapper.py + create mode 100644 vllm/distributed/device_communicators/shm_broadcast.py + create mode 100644 vllm/distributed/device_communicators/tpu_communicator.py + create mode 100644 vllm/distributed/device_communicators/xpu_communicator.py + create mode 100644 vllm/distributed/kv_transfer/README.md + create mode 100644 vllm/distributed/kv_transfer/__init__.py + create mode 100644 vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg + create mode 100644 vllm/distributed/kv_transfer/kv_connector/__init__.py + create mode 100644 vllm/distributed/kv_transfer/kv_connector/base.py + create mode 100644 vllm/distributed/kv_transfer/kv_connector/factory.py + create mode 100644 vllm/distributed/kv_transfer/kv_connector/simple_connector.py + create mode 100644 
vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py + create mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/base.py + create mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py + create mode 100644 vllm/distributed/kv_transfer/kv_pipe/__init__.py + create mode 100644 vllm/distributed/kv_transfer/kv_pipe/base.py + create mode 100644 vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py + create mode 100644 vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py + create mode 100644 vllm/distributed/kv_transfer/kv_transfer_agent.py + create mode 100644 vllm/engine/async_timeout.py + create mode 100644 vllm/engine/metrics_types.py + create mode 100644 vllm/engine/multiprocessing/__init__.py + create mode 100644 vllm/engine/multiprocessing/client.py + create mode 100644 vllm/engine/multiprocessing/engine.py + create mode 100644 vllm/engine/protocol.py + create mode 100644 vllm/entrypoints/chat_utils.py + create mode 100644 vllm/entrypoints/launcher.py + create mode 100644 vllm/entrypoints/logger.py + create mode 100644 vllm/entrypoints/openai/logits_processors.py + create mode 100644 vllm/entrypoints/openai/run_batch.py + create mode 100644 vllm/entrypoints/openai/serving_embedding.py + create mode 100644 vllm/entrypoints/openai/serving_models.py + create mode 100644 vllm/entrypoints/openai/serving_pooling.py + create mode 100644 vllm/entrypoints/openai/serving_score.py + create mode 100644 vllm/entrypoints/openai/serving_tokenization.py + create mode 100644 vllm/entrypoints/openai/tool_parsers/__init__.py + create mode 100644 vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py + create mode 100644 vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py + create mode 100644 vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py + create mode 100644 vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py + create mode 100644 vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py + create mode 100644 vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py + create mode 100644 vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py + create mode 100644 vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py + create mode 100644 vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py + create mode 100644 vllm/entrypoints/openai/tool_parsers/utils.py + create mode 100644 vllm/entrypoints/utils.py + create mode 100644 vllm/executor/hpu_executor.py + create mode 100644 vllm/executor/msgspec_utils.py + create mode 100644 vllm/executor/multiproc_gpu_executor.py + create mode 100644 vllm/executor/multiproc_xpu_executor.py + create mode 100644 vllm/executor/openvino_executor.py + create mode 100644 vllm/executor/ray_hpu_executor.py + create mode 100644 vllm/executor/ray_tpu_executor.py + create mode 100644 vllm/executor/ray_xpu_executor.py + create mode 100644 vllm/executor/tpu_executor.py + create mode 100644 vllm/executor/xpu_executor.py + create mode 100644 vllm/forward_context.py + create mode 100644 vllm/inputs/__init__.py + create mode 100644 vllm/inputs/data.py + create mode 100644 vllm/inputs/parse.py + create mode 100644 vllm/inputs/preprocess.py + create mode 100644 vllm/inputs/registry.py + create mode 100644 vllm/logging_utils/__init__.py + create mode 100644 vllm/logging_utils/formatter.py + create mode 100644 vllm/logits_process.py + create mode 100644 vllm/lora/ops/__init__.py + create mode 100644 vllm/lora/ops/torch_ops/__init__.py + create mode 100644 vllm/lora/ops/torch_ops/lora_ops.py + create mode 
100644 vllm/lora/ops/triton_ops/__init__.py + create mode 100644 vllm/lora/ops/triton_ops/bgmv_expand.py + create mode 100644 vllm/lora/ops/triton_ops/bgmv_expand_slice.py + create mode 100644 vllm/lora/ops/triton_ops/bgmv_shrink.py + create mode 100644 vllm/lora/ops/triton_ops/sgmv_expand.py + create mode 100644 vllm/lora/ops/triton_ops/sgmv_shrink.py + create mode 100644 vllm/lora/ops/triton_ops/utils.py + create mode 100644 vllm/lora/peft_helper.py + create mode 100644 vllm/lora/punica_wrapper/__init__.py + create mode 100644 vllm/lora/punica_wrapper/punica_base.py + create mode 100644 vllm/lora/punica_wrapper/punica_cpu.py + create mode 100644 vllm/lora/punica_wrapper/punica_gpu.py + create mode 100644 vllm/lora/punica_wrapper/punica_hpu.py + create mode 100644 vllm/lora/punica_wrapper/punica_selector.py + create mode 100644 vllm/lora/punica_wrapper/utils.py + create mode 100644 vllm/model_executor/custom_op.py + create mode 100644 vllm/model_executor/guided_decoding/guided_fields.py + create mode 100644 vllm/model_executor/guided_decoding/utils.py + create mode 100644 vllm/model_executor/guided_decoding/xgrammar_decoding.py + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json + create mode 100644 
vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json + create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json + create mode 100644 vllm/model_executor/layers/fused_moe/fused_marlin_moe.py + create mode 100644 vllm/model_executor/layers/fused_moe/layer.py + create mode 100644 vllm/model_executor/layers/fused_moe/moe_pallas.py + create mode 100644 vllm/model_executor/layers/fused_moe/moe_torch_iterative.py + create mode 100644 vllm/model_executor/layers/mamba/__init__.py + create mode 100644 vllm/model_executor/layers/mamba/mamba_mixer.py + create mode 100644 vllm/model_executor/layers/mamba/ops/__init__.py + create mode 100644 vllm/model_executor/layers/mamba/ops/causal_conv1d.py + create mode 100644 vllm/model_executor/layers/mamba/ops/mamba_ssm.py + create mode 100644 vllm/model_executor/layers/pooler.py + create mode 100644 vllm/model_executor/layers/quantization/awq_marlin.py + create mode 100644 vllm/model_executor/layers/quantization/awq_triton.py + 
create mode 100644 vllm/model_executor/layers/quantization/bitsandbytes.py + create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/__init__.py + create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py + create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py + create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py + create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py + create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py + create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py + create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py + create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py + create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py + create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py + create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py + create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/utils.py + create mode 100644 vllm/model_executor/layers/quantization/deepspeedfp.py + create mode 100644 vllm/model_executor/layers/quantization/experts_int8.py + create mode 100644 vllm/model_executor/layers/quantization/fbgemm_fp8.py + create mode 100644 vllm/model_executor/layers/quantization/gguf.py + create mode 100644 vllm/model_executor/layers/quantization/gptq_marlin_24.py + create mode 100644 vllm/model_executor/layers/quantization/hqq_marlin.py + create mode 100644 vllm/model_executor/layers/quantization/ipex_quant.py + create mode 100644 vllm/model_executor/layers/quantization/kernels/__init__.py + create mode 100644 vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py + create mode 100644 vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py + create mode 100644 vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py + create mode 100644 vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py + create mode 100644 vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py + create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py + create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py + create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py + create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py + create mode 100644 vllm/model_executor/layers/quantization/kv_cache.py + create mode 100644 vllm/model_executor/layers/quantization/modelopt.py + create mode 100644 vllm/model_executor/layers/quantization/neuron_quant.py + create mode 100644 vllm/model_executor/layers/quantization/qqq.py + create mode 100644 vllm/model_executor/layers/quantization/tpu_int8.py + create mode 100644 vllm/model_executor/layers/quantization/utils/__init__.py + create mode 100644 vllm/model_executor/layers/quantization/utils/fp8_utils.py + create mode 100644 
vllm/model_executor/layers/quantization/utils/layer_utils.py + create mode 100644 vllm/model_executor/layers/quantization/utils/machete_utils.py + create mode 100644 vllm/model_executor/layers/quantization/utils/marlin_utils.py + create mode 100644 vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py + create mode 100644 vllm/model_executor/layers/quantization/utils/marlin_utils_test.py + create mode 100644 vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py + create mode 100644 vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py + create mode 100644 vllm/model_executor/layers/quantization/utils/quant_utils.py + create mode 100644 vllm/model_executor/layers/quantization/utils/w8a8_utils.py + create mode 100644 vllm/model_executor/layers/resampler.py + create mode 100644 vllm/model_executor/layers/spec_decode_base_sampler.py + create mode 100644 vllm/model_executor/layers/typical_acceptance_sampler.py + create mode 100644 vllm/model_executor/layers/utils.py + create mode 100644 vllm/model_executor/model_loader/openvino.py + create mode 100644 vllm/model_executor/models/adapters.py + create mode 100644 vllm/model_executor/models/arctic.py + create mode 100644 vllm/model_executor/models/aria.py + create mode 100644 vllm/model_executor/models/bart.py + create mode 100644 vllm/model_executor/models/bert.py + create mode 100644 vllm/model_executor/models/blip.py + create mode 100644 vllm/model_executor/models/blip2.py + create mode 100644 vllm/model_executor/models/chameleon.py + create mode 100644 vllm/model_executor/models/clip.py + create mode 100644 vllm/model_executor/models/deepseek_v2.py + create mode 100644 vllm/model_executor/models/deepseek_v3.py + create mode 100644 vllm/model_executor/models/deepseek_vl2.py + create mode 100644 vllm/model_executor/models/eagle.py + create mode 100644 vllm/model_executor/models/exaone.py + create mode 100644 vllm/model_executor/models/florence2.py + create mode 100644 vllm/model_executor/models/fuyu.py + create mode 100644 vllm/model_executor/models/gemma2.py + create mode 100644 vllm/model_executor/models/glm.py + create mode 100644 vllm/model_executor/models/glm4_vision_encoder.py + create mode 100644 vllm/model_executor/models/granite.py + create mode 100644 vllm/model_executor/models/granitemoe.py + create mode 100644 vllm/model_executor/models/gritlm.py + create mode 100644 vllm/model_executor/models/h2ovl.py + create mode 100644 vllm/model_executor/models/idefics2_vision_model.py + create mode 100644 vllm/model_executor/models/idefics3.py + create mode 100644 vllm/model_executor/models/interfaces.py + create mode 100644 vllm/model_executor/models/interfaces_base.py + create mode 100644 vllm/model_executor/models/intern_vit.py + create mode 100644 vllm/model_executor/models/internlm2_ve.py + create mode 100644 vllm/model_executor/models/internvl.py + create mode 100644 vllm/model_executor/models/jamba.py + create mode 100644 vllm/model_executor/models/llava_next.py + create mode 100644 vllm/model_executor/models/llava_next_video.py + create mode 100644 vllm/model_executor/models/llava_onevision.py + create mode 100644 vllm/model_executor/models/mamba.py + create mode 100644 vllm/model_executor/models/mamba_cache.py + create mode 100644 vllm/model_executor/models/medusa.py + create mode 100644 vllm/model_executor/models/minicpm3.py + create mode 100644 vllm/model_executor/models/minicpmv.py + create mode 100644 vllm/model_executor/models/mllama.py + create mode 100644 
vllm/model_executor/models/mlp_speculator.py + create mode 100644 vllm/model_executor/models/module_mapping.py + create mode 100644 vllm/model_executor/models/molmo.py + create mode 100644 vllm/model_executor/models/nemotron.py + create mode 100644 vllm/model_executor/models/nvlm_d.py + create mode 100644 vllm/model_executor/models/olmo2.py + create mode 100644 vllm/model_executor/models/olmoe.py + create mode 100644 vllm/model_executor/models/paligemma.py + create mode 100644 vllm/model_executor/models/persimmon.py + create mode 100644 vllm/model_executor/models/phi3.py + create mode 100644 vllm/model_executor/models/phi3_small.py + create mode 100644 vllm/model_executor/models/phi3v.py + create mode 100644 vllm/model_executor/models/phimoe.py + create mode 100644 vllm/model_executor/models/pixtral.py + create mode 100644 vllm/model_executor/models/qwen2_audio.py + create mode 100644 vllm/model_executor/models/qwen2_rm.py + create mode 100644 vllm/model_executor/models/qwen2_vl.py + create mode 100644 vllm/model_executor/models/registry.py + create mode 100644 vllm/model_executor/models/roberta.py + create mode 100644 vllm/model_executor/models/siglip.py + create mode 100644 vllm/model_executor/models/solar.py + create mode 100644 vllm/model_executor/models/telechat2.py + create mode 100644 vllm/model_executor/models/ultravox.py + create mode 100644 vllm/model_executor/models/utils.py + create mode 100644 vllm/model_executor/models/vision.py + create mode 100644 vllm/model_executor/models/whisper.py + create mode 100644 vllm/model_executor/parameter.py + create mode 100644 vllm/model_executor/pooling_metadata.py + create mode 100644 vllm/multimodal/__init__.py + create mode 100644 vllm/multimodal/audio.py + create mode 100644 vllm/multimodal/base.py + create mode 100644 vllm/multimodal/hasher.py + create mode 100644 vllm/multimodal/image.py + create mode 100644 vllm/multimodal/inputs.py + create mode 100644 vllm/multimodal/parse.py + create mode 100644 vllm/multimodal/processing.py + create mode 100644 vllm/multimodal/profiling.py + create mode 100644 vllm/multimodal/registry.py + create mode 100644 vllm/multimodal/utils.py + create mode 100644 vllm/multimodal/video.py + create mode 100644 vllm/platforms/__init__.py + create mode 100644 vllm/platforms/cpu.py + create mode 100644 vllm/platforms/cuda.py + create mode 100644 vllm/platforms/hpu.py + create mode 100644 vllm/platforms/interface.py + create mode 100644 vllm/platforms/neuron.py + create mode 100644 vllm/platforms/openvino.py + create mode 100644 vllm/platforms/rocm.py + create mode 100644 vllm/platforms/tpu.py + create mode 100644 vllm/platforms/xpu.py + create mode 100644 vllm/plugins/__init__.py + create mode 100644 vllm/pooling_params.py + create mode 100644 vllm/profiler/__init__.py + create mode 100644 vllm/profiler/layerwise_profile.py + create mode 100644 vllm/profiler/utils.py + create mode 100644 vllm/prompt_adapter/__init__.py + create mode 100644 vllm/prompt_adapter/layers.py + create mode 100644 vllm/prompt_adapter/models.py + create mode 100644 vllm/prompt_adapter/request.py + create mode 100644 vllm/prompt_adapter/utils.py + create mode 100644 vllm/prompt_adapter/worker_manager.py + create mode 100644 vllm/scalar_type.py + create mode 100644 vllm/scripts.py + create mode 100644 vllm/spec_decode/draft_model_runner.py + create mode 100644 vllm/spec_decode/medusa_worker.py + create mode 100644 vllm/spec_decode/mlp_speculator_worker.py + create mode 100644 vllm/spec_decode/mqa_scorer.py + create mode 100644 
vllm/spec_decode/proposer_worker_base.py + create mode 100644 vllm/spec_decode/smaller_tp_proposer_worker.py + create mode 100644 vllm/spec_decode/target_model_runner.py + create mode 100644 vllm/tracing.py + create mode 100644 vllm/transformers_utils/configs/arctic.py + create mode 100644 vllm/transformers_utils/configs/aria.py + create mode 100644 vllm/transformers_utils/configs/cohere2.py + create mode 100644 vllm/transformers_utils/configs/deepseek_vl2.py + create mode 100644 vllm/transformers_utils/configs/eagle.py + create mode 100644 vllm/transformers_utils/configs/exaone.py + create mode 100644 vllm/transformers_utils/configs/h2ovl.py + create mode 100644 vllm/transformers_utils/configs/internvl.py + create mode 100644 vllm/transformers_utils/configs/medusa.py + create mode 100644 vllm/transformers_utils/configs/mllama.py + create mode 100644 vllm/transformers_utils/configs/mlp_speculator.py + create mode 100644 vllm/transformers_utils/configs/nemotron.py + create mode 100644 vllm/transformers_utils/configs/nvlm_d.py + create mode 100644 vllm/transformers_utils/configs/olmo2.py + create mode 100644 vllm/transformers_utils/configs/solar.py + create mode 100644 vllm/transformers_utils/configs/telechat2.py + create mode 100644 vllm/transformers_utils/configs/ultravox.py + create mode 100644 vllm/transformers_utils/detokenizer_utils.py + create mode 100644 vllm/transformers_utils/processor.py + create mode 100644 vllm/transformers_utils/s3_utils.py + create mode 100644 vllm/transformers_utils/tokenizers/mistral.py + create mode 100644 vllm/transformers_utils/utils.py + create mode 100644 vllm/triton_utils/__init__.py + create mode 100644 vllm/triton_utils/custom_cache_manager.py + create mode 100644 vllm/triton_utils/importing.py + create mode 100644 vllm/v1/__init__.py + create mode 100644 vllm/v1/attention/__init__.py + create mode 100644 vllm/v1/attention/backends/__init__.py + create mode 100644 vllm/v1/attention/backends/flash_attn.py + create mode 100644 vllm/v1/core/__init__.py + create mode 100644 vllm/v1/core/encoder_cache_manager.py + create mode 100644 vllm/v1/core/kv_cache_manager.py + create mode 100644 vllm/v1/core/kv_cache_utils.py + create mode 100644 vllm/v1/core/scheduler.py + create mode 100644 vllm/v1/engine/__init__.py + create mode 100644 vllm/v1/engine/async_llm.py + create mode 100644 vllm/v1/engine/core.py + create mode 100644 vllm/v1/engine/core_client.py + create mode 100644 vllm/v1/engine/detokenizer.py + create mode 100644 vllm/v1/engine/llm_engine.py + create mode 100644 vllm/v1/engine/mm_input_mapper.py + create mode 100644 vllm/v1/engine/output_processor.py + create mode 100644 vllm/v1/engine/processor.py + create mode 100644 vllm/v1/executor/__init__.py + create mode 100644 vllm/v1/executor/abstract.py + create mode 100644 vllm/v1/executor/multiproc_executor.py + create mode 100644 vllm/v1/executor/ray_executor.py + create mode 100644 vllm/v1/executor/ray_utils.py + create mode 100644 vllm/v1/executor/uniproc_executor.py + create mode 100644 vllm/v1/metrics/__init__.py + create mode 100644 vllm/v1/metrics/loggers.py + create mode 100644 vllm/v1/metrics/stats.py + create mode 100644 vllm/v1/outputs.py + create mode 100644 vllm/v1/request.py + create mode 100644 vllm/v1/sample/__init__.py + create mode 100644 vllm/v1/sample/metadata.py + create mode 100644 vllm/v1/sample/ops/__init__.py + create mode 100644 vllm/v1/sample/ops/penalties.py + create mode 100644 vllm/v1/sample/ops/topk_topp_sampler.py + create mode 100644 vllm/v1/sample/sampler.py + create 
mode 100644 vllm/v1/serial_utils.py + create mode 100644 vllm/v1/utils.py + create mode 100644 vllm/v1/worker/__init__.py + create mode 100644 vllm/v1/worker/block_table.py + create mode 100644 vllm/v1/worker/gpu_input_batch.py + create mode 100644 vllm/v1/worker/gpu_model_runner.py + create mode 100644 vllm/v1/worker/gpu_worker.py + create mode 100644 vllm/version.py + create mode 100644 vllm/worker/cpu_enc_dec_model_runner.py + create mode 100644 vllm/worker/cpu_pooling_model_runner.py + create mode 100644 vllm/worker/enc_dec_model_runner.py + create mode 100644 vllm/worker/hpu_model_runner.py + create mode 100644 vllm/worker/hpu_worker.py + create mode 100644 vllm/worker/model_runner_base.py + create mode 100644 vllm/worker/multi_step_model_runner.py + create mode 100644 vllm/worker/multi_step_tpu_worker.py + create mode 100644 vllm/worker/multi_step_worker.py + create mode 100644 vllm/worker/openvino_model_runner.py + create mode 100644 vllm/worker/openvino_worker.py + create mode 100644 vllm/worker/pooling_model_runner.py + create mode 100644 vllm/worker/tpu_model_runner.py + create mode 100644 vllm/worker/tpu_worker.py + create mode 100644 vllm/worker/utils.py + create mode 100644 vllm/worker/xpu_model_runner.py + create mode 100644 vllm/worker/xpu_worker.py + +diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py +index 90a5e54..0412c5f 100644 +--- a/.buildkite/check-wheel-size.py ++++ b/.buildkite/check-wheel-size.py +@@ -1,36 +1,43 @@ + import os ++import sys + import zipfile + +-MAX_SIZE_MB = 100 ++# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB ++VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250)) + + + def print_top_10_largest_files(zip_file): ++ """Print the top 10 largest files in the given zip file.""" + with zipfile.ZipFile(zip_file, 'r') as z: + file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()] + file_sizes.sort(key=lambda x: x[1], reverse=True) + for f, size in file_sizes[:10]: +- print(f"{f}: {size/(1024*1024)} MBs uncompressed.") ++ print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.") + + + def check_wheel_size(directory): ++ """Check the size of .whl files in the given directory.""" + for root, _, files in os.walk(directory): +- for f in files: +- if f.endswith(".whl"): +- wheel_path = os.path.join(root, f) +- wheel_size = os.path.getsize(wheel_path) +- wheel_size_mb = wheel_size / (1024 * 1024) +- if wheel_size_mb > MAX_SIZE_MB: +- print( +- f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) " +- f"compare to the allowed size ({MAX_SIZE_MB} MB).") ++ for file_name in files: ++ if file_name.endswith(".whl"): ++ wheel_path = os.path.join(root, file_name) ++ wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024) ++ if wheel_size_mb > VLLM_MAX_SIZE_MB: ++ print(f"Not allowed: Wheel {wheel_path} is larger " ++ f"({wheel_size_mb:.2f} MB) than the limit " ++ f"({VLLM_MAX_SIZE_MB} MB).") + print_top_10_largest_files(wheel_path) + return 1 + else: + print(f"Wheel {wheel_path} is within the allowed size " +- f"({wheel_size_mb} MB).") ++ f"({wheel_size_mb:.2f} MB).") + return 0 + + + if __name__ == "__main__": +- import sys +- sys.exit(check_wheel_size(sys.argv[1])) ++ if len(sys.argv) < 2: ++ print("Usage: python check-wheel-size.py ") ++ sys.exit(1) ++ ++ directory = sys.argv[1] ++ sys.exit(check_wheel_size(directory)) +\ No newline at end of file +diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py +new file mode 100644 +index 0000000..8350e27 +--- /dev/null ++++ 
b/.buildkite/generate_index.py +@@ -0,0 +1,24 @@ ++import argparse ++import os ++ ++template = """<!DOCTYPE html> ++<html> ++    <body> ++        <h1>Links for vLLM</h1> ++        <a href="../{wheel_html_escaped}">
++ {wheel}
++ ++ ++""" ++ ++parser = argparse.ArgumentParser() ++parser.add_argument("--wheel", help="The wheel path.", required=True) ++args = parser.parse_args() ++ ++filename = os.path.basename(args.wheel) ++ ++with open("index.html", "w") as f: ++ print(f"Generated index.html for {args.wheel}") ++ # cloudfront requires escaping the '+' character ++ f.write( ++ template.format(wheel=filename, ++ wheel_html_escaped=filename.replace("+", "%2B"))) +diff --git a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml +new file mode 100644 +index 0000000..d70ecb2 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml +@@ -0,0 +1,12 @@ ++# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2 ++model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.671 ++ - name: "exact_match,flexible-extract" ++ value: 0.664 ++limit: 1000 ++num_fewshot: 5 ++trust_remote_code: True +\ No newline at end of file +diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml +new file mode 100644 +index 0000000..4397eff +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 ++model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.905 ++ - name: "exact_match,flexible-extract" ++ value: 0.905 ++limit: 1000 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml +new file mode 100644 +index 0000000..fa6ea23 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 ++model_name: "meta-llama/Meta-Llama-3-70B-Instruct" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.892 ++ - name: "exact_match,flexible-extract" ++ value: 0.892 ++limit: 250 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml +new file mode 100644 +index 0000000..c513159 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1 ++model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.752 ++ - name: "exact_match,flexible-extract" ++ value: 0.754 ++limit: 1000 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml 
b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml +new file mode 100644 +index 0000000..5e57fcb +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1 ++model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.753 ++ - name: "exact_match,flexible-extract" ++ value: 0.753 ++limit: 1000 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml +new file mode 100644 +index 0000000..374171f +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1 ++model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.755 ++ - name: "exact_match,flexible-extract" ++ value: 0.755 ++limit: 1000 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml +new file mode 100644 +index 0000000..dc36b70 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 ++model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.753 ++ - name: "exact_match,flexible-extract" ++ value: 0.753 ++limit: 1000 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml +new file mode 100644 +index 0000000..0ecfc01 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 ++model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.764 ++ - name: "exact_match,flexible-extract" ++ value: 0.764 ++limit: 250 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml +new file mode 100644 +index 0000000..bc29002 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 ++model_name: 
"nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.728 ++ - name: "exact_match,flexible-extract" ++ value: 0.728 ++limit: 250 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml +new file mode 100644 +index 0000000..3964f3b +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 ++model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.758 ++ - name: "exact_match,flexible-extract" ++ value: 0.759 ++limit: 1000 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml +new file mode 100644 +index 0000000..fb4b491 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1 ++model_name: "meta-llama/Meta-Llama-3-8B-Instruct" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.756 ++ - name: "exact_match,flexible-extract" ++ value: 0.752 ++limit: 250 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml +new file mode 100644 +index 0000000..0424586 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 ++model_name: "HandH1998/QQQ-Llama-3-8b-g128" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.419 ++ - name: "exact_match,flexible-extract" ++ value: 0.416 ++limit: 1000 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml +new file mode 100644 +index 0000000..78347f6 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 ++model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.356 ++ - name: "exact_match,flexible-extract" ++ value: 0.358 ++limit: 1000 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml +new file mode 100644 +index 0000000..3ea0b7b +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1 ++model_name: 
"mgoin/Minitron-4B-Base-FP8" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.233 ++ - name: "exact_match,flexible-extract" ++ value: 0.236 ++limit: 1000 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml +new file mode 100644 +index 0000000..75a24e4 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml +@@ -0,0 +1,11 @@ ++# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 ++model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.86 ++ - name: "exact_match,flexible-extract" ++ value: 0.86 ++limit: 250 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml +new file mode 100644 +index 0000000..436ec21 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml +@@ -0,0 +1,11 @@ ++# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 ++model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.624 ++ - name: "exact_match,flexible-extract" ++ value: 0.624 ++limit: 250 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml +new file mode 100644 +index 0000000..dec9164 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4 ++model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.616 ++ - name: "exact_match,flexible-extract" ++ value: 0.632 ++limit: 250 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml +new file mode 100644 +index 0000000..42936fb +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 ++model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.578 ++ - name: "exact_match,flexible-extract" ++ value: 0.585 ++limit: 1000 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml +new file mode 100644 +index 0000000..43ff2bc +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 ++model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: 
"exact_match,strict-match" ++ value: 0.593 ++ - name: "exact_match,flexible-extract" ++ value: 0.588 ++limit: 1000 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml +new file mode 100644 +index 0000000..259799b +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml +@@ -0,0 +1,11 @@ ++# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 ++model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.595 ++ - name: "exact_match,flexible-extract" ++ value: 0.582 ++limit: 1000 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml +new file mode 100644 +index 0000000..45d5efc +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml +@@ -0,0 +1,11 @@ ++# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 ++model_name: "Qwen/Qwen2-57B-A14B-Instruct" ++tasks: ++- name: "gsm8k" ++ metrics: ++ - name: "exact_match,strict-match" ++ value: 0.792 ++ - name: "exact_match,flexible-extract" ++ value: 0.824 ++limit: 250 ++num_fewshot: 5 +diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt +new file mode 100644 +index 0000000..37eeac8 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/models-large.txt +@@ -0,0 +1,5 @@ ++Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml ++Meta-Llama-3-70B-Instruct.yaml ++Mixtral-8x7B-Instruct-v0.1.yaml ++Qwen2-57B-A14-Instruct.yaml ++DeepSeek-V2-Lite-Chat.yaml +diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt +new file mode 100644 +index 0000000..6057229 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/configs/models-small.txt +@@ -0,0 +1,10 @@ ++Meta-Llama-3-8B-Instruct.yaml ++Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml ++Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml ++Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml ++Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml ++Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml ++Minitron-4B-Base-FP8.yaml ++Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml ++Qwen2-1.5B-Instruct-FP8W8.yaml ++Meta-Llama-3-8B-QQQ.yaml +diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +new file mode 100644 +index 0000000..a67fc89 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +@@ -0,0 +1,46 @@ ++#!/bin/bash ++# We can use this script to compute baseline accuracy on GSM for transformers. ++# ++# Make sure you have lm-eval-harness installed: ++# pip install lm-eval==0.4.4 ++ ++usage() { ++ echo`` ++ echo "Runs lm eval harness on GSM8k using huggingface transformers." 
++ echo "This pathway is intended to be used to create baselines for " ++ echo "our automated nm-test-accuracy workflow" ++ echo ++ echo "usage: ${0} " ++ echo ++ echo " -m - huggingface stub or local directory of the model" ++ echo " -b - batch size to run the evaluation at" ++ echo " -l - limit number of samples to run" ++ echo " -f - number of fewshot samples to use" ++ echo ++} ++ ++while getopts "m:b:l:f:" OPT; do ++ case ${OPT} in ++ m ) ++ MODEL="$OPTARG" ++ ;; ++ b ) ++ BATCH_SIZE="$OPTARG" ++ ;; ++ l ) ++ LIMIT="$OPTARG" ++ ;; ++ f ) ++ FEWSHOT="$OPTARG" ++ ;; ++ \? ) ++ usage ++ exit 1 ++ ;; ++ esac ++done ++ ++lm_eval --model hf \ ++ --model_args "pretrained=$MODEL,parallelize=True" \ ++ --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ ++ --batch_size "$BATCH_SIZE" +diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +new file mode 100644 +index 0000000..65be3c5 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +@@ -0,0 +1,51 @@ ++#!/bin/bash ++# We can use this script to compute baseline accuracy on GSM for vllm. ++# We use this for fp8, which HF does not support. ++# ++# Make sure you have lm-eval-harness installed: ++# pip install lm-eval==0.4.4 ++ ++usage() { ++ echo`` ++ echo "Runs lm eval harness on GSM8k using huggingface transformers." ++ echo "This pathway is intended to be used to create baselines for " ++ echo "our automated nm-test-accuracy workflow" ++ echo ++ echo "usage: ${0} " ++ echo ++ echo " -m - huggingface stub or local directory of the model" ++ echo " -b - batch size to run the evaluation at" ++ echo " -l - limit number of samples to run" ++ echo " -f - number of fewshot samples to use" ++ echo " -t - tensor parallel size to run at" ++ echo ++} ++ ++while getopts "m:b:l:f:t:" OPT; do ++ case ${OPT} in ++ m ) ++ MODEL="$OPTARG" ++ ;; ++ b ) ++ BATCH_SIZE="$OPTARG" ++ ;; ++ l ) ++ LIMIT="$OPTARG" ++ ;; ++ f ) ++ FEWSHOT="$OPTARG" ++ ;; ++ t ) ++ TP_SIZE="$OPTARG" ++ ;; ++ \? ) ++ usage ++ exit 1 ++ ;; ++ esac ++done ++ ++lm_eval --model vllm \ ++ --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \ ++ --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ ++ --batch_size "$BATCH_SIZE" +diff --git a/.buildkite/lm-eval-harness/run-tests.sh b/.buildkite/lm-eval-harness/run-tests.sh +new file mode 100644 +index 0000000..26f33b7 +--- /dev/null ++++ b/.buildkite/lm-eval-harness/run-tests.sh +@@ -0,0 +1,59 @@ ++#!/bin/bash ++ ++usage() { ++ echo`` ++ echo "Runs lm eval harness on GSM8k using vllm and compares to " ++ echo "precomputed baseline (measured by HF transformers.)" ++ echo ++ echo "usage: ${0} " ++ echo ++ echo " -c - path to the test data config (e.g. configs/small-models.txt)" ++ echo " -t - tensor parallel size" ++ echo ++} ++ ++SUCCESS=0 ++ ++while getopts "c:t:" OPT; do ++ case ${OPT} in ++ c ) ++ CONFIG="$OPTARG" ++ ;; ++ t ) ++ TP_SIZE="$OPTARG" ++ ;; ++ \? ) ++ usage ++ exit 1 ++ ;; ++ esac ++done ++ ++# Parse list of configs. ++IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" ++ ++for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" ++do ++ LOCAL_SUCCESS=0 ++ ++ echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE===" ++ ++ export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG} ++ export LM_EVAL_TP_SIZE=$TP_SIZE ++ pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$? 
++ ++ if [[ $LOCAL_SUCCESS == 0 ]]; then ++ echo "=== PASSED MODEL: ${MODEL_CONFIG} ===" ++ else ++ echo "=== FAILED MODEL: ${MODEL_CONFIG} ===" ++ fi ++ ++ SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) ++ ++done ++ ++if [ "${SUCCESS}" -eq "0" ]; then ++ exit 0 ++else ++ exit 1 ++fi +diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +new file mode 100644 +index 0000000..afc935c +--- /dev/null ++++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +@@ -0,0 +1,63 @@ ++""" ++LM eval harness on model to compare vs HF baseline computed offline. ++Configs are found in configs/$MODEL.yaml ++ ++* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml ++* export LM_EVAL_TP_SIZE=4 ++* pytest -s test_lm_eval_correctness.py ++""" ++ ++import os ++from pathlib import Path ++ ++import lm_eval ++import numpy ++import yaml ++ ++RTOL = 0.05 ++TEST_DATA_FILE = os.environ.get( ++ "LM_EVAL_TEST_DATA_FILE", ++ ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") ++ ++TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1) ++ ++ ++def launch_lm_eval(eval_config): ++ trust_remote_code = eval_config.get('trust_remote_code', False) ++ ++ model_args = f"pretrained={eval_config['model_name']}," \ ++ f"tensor_parallel_size={TP_SIZE}," \ ++ f"add_bos_token=true," \ ++ f"trust_remote_code={trust_remote_code}" ++ ++ results = lm_eval.simple_evaluate( ++ model="vllm", ++ model_args=model_args, ++ tasks=[task["name"] for task in eval_config["tasks"]], ++ num_fewshot=eval_config["num_fewshot"], ++ limit=eval_config["limit"], ++ batch_size="auto") ++ ++ return results ++ ++ ++def test_lm_eval_correctness(): ++ eval_config = yaml.safe_load( ++ Path(TEST_DATA_FILE).read_text(encoding="utf-8")) ++ ++ # Launch eval requests. ++ results = launch_lm_eval(eval_config) ++ ++ # Confirm scores match ground truth. ++ success = True ++ for task in eval_config["tasks"]: ++ for metric in task["metrics"]: ++ ground_truth = metric["value"] ++ measured_value = results["results"][task["name"]][metric["name"]] ++ print(f'{task["name"]} | {metric["name"]}: ' ++ f'ground_truth={ground_truth} | measured={measured_value}') ++ success = success and numpy.isclose( ++ ground_truth, measured_value, rtol=RTOL) ++ ++ # Assert at the end, print all scores even on failure for debugging. ++ assert success +diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md +new file mode 100644 +index 0000000..fbf41eb +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/README.md +@@ -0,0 +1,153 @@ ++# vLLM benchmark suite ++ ++ ++## Introduction ++ ++This directory contains two sets of benchmark for vllm. ++- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance ++- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. ++ ++ ++See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. ++ ++ ++## Performance benchmark quick overview ++ ++**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models. ++ ++**Benchmarking Duration**: about 1hr. 
++ ++**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run. ++ ++ ++## Nightly benchmark quick overview ++ ++**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. ++ ++**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy. ++ ++**Benchmarking Duration**: about 3.5hrs. ++ ++ ++ ++## Trigger the benchmark ++ ++Performance benchmark will be triggered when: ++- A PR being merged into vllm. ++- Every commit for those PRs with `perf-benchmarks` label AND `ready` label. ++ ++Nightly benchmark will be triggered when: ++- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. ++ ++ ++ ++ ++## Performance benchmark details ++ ++ ++See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. ++ ++ ++#### Latency test ++ ++Here is an example of one test inside `latency-tests.json`: ++ ++```json ++[ ++ { ++ "test_name": "latency_llama8B_tp1", ++ "parameters": { ++ "model": "meta-llama/Meta-Llama-3-8B", ++ "tensor_parallel_size": 1, ++ "load_format": "dummy", ++ "num_iters_warmup": 5, ++ "num_iters": 15 ++ } ++ }, ++] ++``` ++ ++In this example: ++- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. ++- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` ++ ++Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly. ++ ++WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file. ++ ++ ++#### Throughput test ++The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`. ++ ++The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot. ++ ++#### Serving test ++We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. 
++#### Throughput test
++
++The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are forwarded to `benchmark_throughput.py`.
++
++The numbers reported by this test are also stable; note, however, that a slight change in the parameter values can shift the performance numbers considerably.
++
++#### Serving test
++
++We test the serving throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
++
++```json
++[
++  {
++    "test_name": "serving_llama8B_tp1_sharegpt",
++    "qps_list": [1, 4, 16, "inf"],
++    "server_parameters": {
++      "model": "meta-llama/Meta-Llama-3-8B",
++      "tensor_parallel_size": 1,
++      "swap_space": 16,
++      "disable_log_stats": "",
++      "disable_log_requests": "",
++      "load_format": "dummy"
++    },
++    "client_parameters": {
++      "model": "meta-llama/Meta-Llama-3-8B",
++      "backend": "vllm",
++      "dataset_name": "sharegpt",
++      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
++      "num_prompts": 200
++    }
++  },
++]
++```
++
++Inside this example:
++- The `test_name` attribute is again a unique identifier for the test. It must start with `serving_`.
++- The `server_parameters` attribute includes the command line arguments for the vLLM server.
++- The `client_parameters` attribute includes the command line arguments for `benchmark_serving.py`.
++- The `qps_list` attribute controls the list of QPS values to test; each value is passed to `benchmark_serving.py` via the `--request-rate` parameter.
++
++The numbers of this test are less stable than the latency and throughput benchmarks (due to the randomized ShareGPT dataset sampling inside `benchmark_serving.py`), but a large change (e.g. 5% or more) still indicates a real difference.
++
++WARNING: The benchmarking script saves the json results by itself, so please do not configure `--save-result` or other result-saving parameters in `serving-tests.json`.
++
++#### Visualizing the results
++
++`convert-results-json-to-markdown.py` puts the benchmarking results into a markdown table by formatting [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) with the real benchmarking results.
++You can find the results presented as a table on the `buildkite/performance-benchmark` job page.
++If you do not see the table, please wait until the benchmark finishes running.
++The JSON version of the tables (together with the JSON version of the benchmark results) is also attached to the markdown file.
++The raw benchmarking results (as json files) are available in the `Artifacts` tab of the benchmarking job.
++
++
++
++## Nightly test details
++
++See [nightly-descriptions.md](nightly-descriptions.md) for a detailed description of the test workload, models and docker containers used for benchmarking the other LLM engines.
++
++
++#### Workflow
++
++- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for the different LLM serving engines.
++- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which probes the serving engine of the current container.
++- `run-nightly-suite.sh` then dispatches to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
++- Finally, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and upload the results to Buildkite.
++
++#### Nightly tests
++
++[nightly-tests.json](tests/nightly-tests.json) contains the command line arguments for the benchmarking commands, together with the benchmarking test cases. The format is very similar to that of the performance benchmark.
++
++#### Docker containers
++
++The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
++
++WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`.
The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. ++ ++WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). ++ +diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +new file mode 100644 +index 0000000..679abf1 +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +@@ -0,0 +1,92 @@ ++steps: ++ - label: "Wait for container to be ready" ++ key: wait-for-container-image ++ agents: ++ queue: A100 ++ plugins: ++ - kubernetes: ++ podSpec: ++ containers: ++ - image: badouralix/curl-jq ++ command: ++ - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh ++ ++ - label: "A100" ++ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" ++ agents: ++ queue: A100 ++ depends_on: wait-for-container-image ++ plugins: ++ - kubernetes: ++ podSpec: ++ priorityClassName: perf-benchmark ++ containers: ++ - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT ++ command: ++ - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh ++ resources: ++ limits: ++ nvidia.com/gpu: 8 ++ volumeMounts: ++ - name: devshm ++ mountPath: /dev/shm ++ env: ++ - name: VLLM_USAGE_SOURCE ++ value: ci-test ++ - name: HF_TOKEN ++ valueFrom: ++ secretKeyRef: ++ name: hf-token-secret ++ key: token ++ nodeSelector: ++ nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB ++ volumes: ++ - name: devshm ++ emptyDir: ++ medium: Memory ++ ++ - label: "H200" ++ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" ++ agents: ++ queue: H200 ++ depends_on: wait-for-container-image ++ plugins: ++ - docker#v5.12.0: ++ image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT ++ command: ++ - bash ++ - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh ++ mount-buildkite-agent: true ++ propagate-environment: true ++ ipc: host ++ gpus: 4,5,6,7 ++ volumes: ++ - /data/benchmark-hf-cache:/root/.cache/huggingface ++ environment: ++ - VLLM_USAGE_SOURCE ++ - HF_TOKEN ++ ++ #- block: "Run H100 Benchmark" ++ #key: block-h100 ++ #depends_on: ~ ++ ++ - label: "H100" ++ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" ++ agents: ++ queue: H100 ++ depends_on: wait-for-container-image ++ plugins: ++ - docker#v5.12.0: ++ image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT ++ command: ++ - bash ++ - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh ++ mount-buildkite-agent: true ++ propagate-environment: true ++ ipc: host ++ gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used ++ volumes: ++ - /data/benchmark-hf-cache:/root/.cache/huggingface ++ environment: ++ - VLLM_USAGE_SOURCE ++ - HF_TOKEN +diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md +new file mode 100644 +index 0000000..1e33793 +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/nightly-annotation.md +@@ -0,0 +1,28 @@ ++ ++## Description ++ ++This file contains the downloading link for benchmarking results. 
++
++- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
++- [benchmarking results](artifact://results.zip)
++- [benchmarking code](artifact://nightly-benchmarks.zip)
++
++Please download the visualization scripts; they are included in the `nightly-benchmarks.zip` linked above.
++
++
++## Results reproduction
++
++- Find the docker image we use in the `benchmarking pipeline` file above.
++- Start a container from that image, and inside the container:
++  - Download `nightly-benchmarks.zip`.
++  - In the same folder, run the following commands:
++```
++export HF_TOKEN=
++apt update
++apt install -y git
++unzip nightly-benchmarks.zip
++VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
++```
++
++The results will be inside `./benchmarks/results`.
++
+diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
+new file mode 100644
+index 0000000..7dec7a0
+--- /dev/null
++++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
+@@ -0,0 +1,39 @@
++
++# Nightly benchmark
++
++This benchmark aims to:
++- Provide performance clarity: show which engine (vllm, tensorrt-llm, lmdeploy or SGLang) leads in performance for which workload.
++- Be reproducible: anyone can run the exact same set of benchmarking commands inside the exact same docker image by following the reproduction instructions.
++
++Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
++
++Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
++
++
++## Setup
++
++- Docker images:
++  - vLLM: `vllm/vllm-openai:v0.6.2`
++  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
++  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
++  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
++    - *NOTE: we use r24.07 because the current implementation only works with this version. We plan to bump it up.*
++  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
++- Hardware
++  - 8x Nvidia A100 GPUs
++- Workload:
++  - Dataset
++    - ShareGPT dataset
++    - Prefill-heavy dataset (462 input tokens on average, 16 output tokens)
++    - Decode-heavy dataset (462 input tokens on average, 256 output tokens)
++    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of the datasets we use.
++  - Models: llama-3 8B, llama-3 70B.
++    - We do not use llama 3.1, as it is incompatible with trt-llm r24.07 ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
++  - Average QPS (queries per second): 2, 4, 8, 16, 32 and inf.
++    - Queries are randomly sampled, and arrival patterns are determined via a Poisson process, all with a fixed random seed (a short illustrative sketch is given at the end of this file).
++  - Evaluation metrics: throughput (higher is better), TTFT (time to first token, lower is better), ITL (inter-token latency, lower is better).
++
++# Known issues
++
++- TRT-LLM crashes with Llama 3.1 8B ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
++- TGI does not support the `ignore-eos` flag.
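++
++For reference, the arrival pattern described in the Setup section can be sketched as follows. This is an illustrative snippet only (the actual sampling happens inside `benchmark_serving.py`): a Poisson process with a given average QPS is obtained by drawing exponentially distributed inter-arrival gaps, and the fixed seed keeps the arrival times identical across engines. `qps = inf` corresponds to sending all requests at once.
++
++```python
++# Illustrative sketch, not the benchmark implementation: sample request
++# arrival times for a Poisson process with a given average QPS.
++import numpy as np
++
++
++def poisson_arrival_times(qps: float, num_requests: int, seed: int = 0):
++    rng = np.random.default_rng(seed)
++    # Exponential inter-arrival gaps with mean 1/qps form a Poisson process.
++    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
++    return np.cumsum(gaps)  # second at which each request is sent
++
++
++for qps in [2, 4, 8, 16, 32]:
++    print(qps, poisson_arrival_times(qps, num_requests=5).round(2))
++```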
+\ No newline at end of file +diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +new file mode 100644 +index 0000000..199517e +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +@@ -0,0 +1,196 @@ ++common_pod_spec: &common_pod_spec ++ priorityClassName: perf-benchmark ++ nodeSelector: ++ nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB ++ volumes: ++ - name: devshm ++ emptyDir: ++ medium: Memory ++ - name: hf-cache ++ hostPath: ++ path: /root/.cache/huggingface ++ type: Directory ++ ++common_container_settings: &common_container_settings ++ command: ++ - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh ++ resources: ++ limits: ++ nvidia.com/gpu: 8 ++ volumeMounts: ++ - name: devshm ++ mountPath: /dev/shm ++ - name: hf-cache ++ mountPath: /root/.cache/huggingface ++ env: ++ - name: VLLM_USAGE_SOURCE ++ value: ci-test ++ - name: HF_HOME ++ value: /root/.cache/huggingface ++ - name: VLLM_SOURCE_CODE_LOC ++ value: /workspace/build/buildkite/vllm/performance-benchmark ++ - name: HF_TOKEN ++ valueFrom: ++ secretKeyRef: ++ name: hf-token-secret ++ key: token ++ ++steps: ++ - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours." ++ ++ ++ ++ - label: "A100 vllm step 10" ++ priority: 100 ++ agents: ++ queue: A100 ++ plugins: ++ - kubernetes: ++ podSpec: ++ <<: *common_pod_spec ++ containers: ++ - image: vllm/vllm-openai:v0.6.2 ++ <<: *common_container_settings ++ ++ ++ ++ - label: "A100 sglang benchmark" ++ priority: 100 ++ agents: ++ queue: A100 ++ plugins: ++ - kubernetes: ++ podSpec: ++ <<: *common_pod_spec ++ containers: ++ - image: lmsysorg/sglang:v0.3.2-cu121 ++ <<: *common_container_settings ++ ++ - label: "A100 lmdeploy benchmark" ++ priority: 100 ++ agents: ++ queue: A100 ++ plugins: ++ - kubernetes: ++ podSpec: ++ <<: *common_pod_spec ++ containers: ++ - image: openmmlab/lmdeploy:v0.6.1-cu12 ++ <<: *common_container_settings ++ ++ ++ ++ ++ - label: "A100 trt llama-8B" ++ priority: 100 ++ agents: ++ queue: A100 ++ plugins: ++ - kubernetes: ++ podSpec: ++ <<: *common_pod_spec ++ containers: ++ - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 ++ <<: *common_container_settings ++ env: ++ - name: VLLM_USAGE_SOURCE ++ value: ci-test ++ - name: HF_HOME ++ value: /root/.cache/huggingface ++ - name: VLLM_SOURCE_CODE_LOC ++ value: /workspace/build/buildkite/vllm/performance-benchmark ++ - name: HF_TOKEN ++ valueFrom: ++ secretKeyRef: ++ name: hf-token-secret ++ key: token ++ - name: TEST_SELECTOR ++ value: "llama8B" ++ ++ ++ - label: "A100 trt llama-70B" ++ priority: 100 ++ agents: ++ queue: A100 ++ plugins: ++ - kubernetes: ++ podSpec: ++ <<: *common_pod_spec ++ containers: ++ - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 ++ <<: *common_container_settings ++ env: ++ - name: VLLM_USAGE_SOURCE ++ value: ci-test ++ - name: HF_HOME ++ value: /root/.cache/huggingface ++ - name: VLLM_SOURCE_CODE_LOC ++ value: /workspace/build/buildkite/vllm/performance-benchmark ++ - name: HF_TOKEN ++ valueFrom: ++ secretKeyRef: ++ name: hf-token-secret ++ key: token ++ - name: TEST_SELECTOR ++ value: "llama70B" ++ ++ ++ # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image ++ # - label: "A100 trt benchmark" ++ # priority: 100 ++ # agents: ++ # queue: A100 ++ # plugins: ++ # - kubernetes: ++ # podSpec: ++ # <<: *common_pod_spec ++ # containers: ++ # - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 ++ # <<: 
*common_container_settings ++ ++ ++ # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`. ++ # - label: "A100 tgi benchmark" ++ # priority: 100 ++ # agents: ++ # queue: A100 ++ # plugins: ++ # - kubernetes: ++ # podSpec: ++ # <<: *common_pod_spec ++ # containers: ++ # - image: ghcr.io/huggingface/text-generation-inference:2.2.0 ++ # <<: *common_container_settings ++ ++ - wait ++ ++ - label: "Collect the results" ++ priority: 100 ++ agents: ++ queue: A100 ++ plugins: ++ - kubernetes: ++ podSpec: ++ <<: *common_pod_spec ++ containers: ++ - image: vllm/vllm-openai:v0.5.0.post1 ++ command: ++ - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh ++ resources: ++ limits: ++ nvidia.com/gpu: 8 ++ volumeMounts: ++ - name: devshm ++ mountPath: /dev/shm ++ env: ++ - name: VLLM_USAGE_SOURCE ++ value: ci-test ++ - name: VLLM_SOURCE_CODE_LOC ++ value: /workspace/build/buildkite/vllm/performance-benchmark ++ - name: HF_TOKEN ++ valueFrom: ++ secretKeyRef: ++ name: hf-token-secret ++ key: token ++ ++ - block: ":rocket: check the results!" +\ No newline at end of file +diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md +new file mode 100644 +index 0000000..da32d1f +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md +@@ -0,0 +1,62 @@ ++ ++## Latency tests ++ ++- Input length: 32 tokens. ++- Output length: 128 tokens. ++- Batch size: fixed (8). ++- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. ++- Evaluation metrics: end-to-end latency (mean, median, p99). ++ ++ ++{latency_tests_markdown_table} ++ ++ ++## Throughput tests ++ ++- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). ++- Output length: the corresponding output length of these 200 prompts. ++- Batch size: dynamically determined by vllm to achieve maximum throughput. ++- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. ++- Evaluation metrics: throughput. ++ ++ ++{throughput_tests_markdown_table} ++ ++ ++## Serving tests ++ ++- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). ++- Output length: the corresponding output length of these 200 prompts. ++- Batch size: dynamically determined by vllm and the arrival pattern of the requests. ++- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). ++- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. ++- We also added a speculative decoding test for llama-3 70B, under QPS 2 ++- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). ++ ++ ++{serving_tests_markdown_table} ++ ++ ++## json version of the benchmarking tables ++ ++This section contains the data of the markdown tables above in JSON format. 
++You can load the benchmarking tables into pandas dataframes as follows: ++ ++```python ++import json ++import pandas as pd ++ ++benchmarking_results_json = """The json string""" ++benchmarking_results = json.loads(benchmarking_results_json) ++latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"]) ++throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"]) ++serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"]) ++``` ++ ++The json string for all benchmarking tables: ++```json ++{benchmarking_results_in_json_string} ++``` ++ ++You can also check the raw experiment data in the Artifact tab of the Buildkite page. ++ +diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +new file mode 100644 +index 0000000..9d3646e +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +@@ -0,0 +1,204 @@ ++import json ++import os ++from pathlib import Path ++ ++import pandas as pd ++from tabulate import tabulate ++ ++results_folder = Path("results/") ++ ++# latency results and the keys that will be printed into markdown ++latency_results = [] ++latency_column_mapping = { ++ "test_name": "Test name", ++ "gpu_type": "GPU", ++ "avg_latency": "Mean latency (ms)", ++ # "P10": "P10 (s)", ++ # "P25": "P25 (s)", ++ "P50": "Median latency (ms)", ++ # "P75": "P75 (s)", ++ # "P90": "P90 (s)", ++ "P99": "P99 latency (ms)", ++} ++ ++# throughput tests and the keys that will be printed into markdown ++throughput_results = [] ++throughput_results_column_mapping = { ++ "test_name": "Test name", ++ "gpu_type": "GPU", ++ # "num_requests": "# of req.", ++ # "total_num_tokens": "Total # of tokens", ++ # "elapsed_time": "Elapsed time (s)", ++ "requests_per_second": "Tput (req/s)", ++ # "tokens_per_second": "Tput (tok/s)", ++} ++ ++# serving results and the keys that will be printed into markdown ++serving_results = [] ++serving_column_mapping = { ++ "test_name": "Test name", ++ "gpu_type": "GPU", ++ # "completed": "# of req.", ++ "request_throughput": "Tput (req/s)", ++ # "input_throughput": "Input Tput (tok/s)", ++ # "output_throughput": "Output Tput (tok/s)", ++ "mean_ttft_ms": "Mean TTFT (ms)", ++ "median_ttft_ms": "Median TTFT (ms)", ++ "p99_ttft_ms": "P99 TTFT (ms)", ++ # "mean_tpot_ms": "Mean TPOT (ms)", ++ # "median_tpot_ms": "Median", ++ # "p99_tpot_ms": "P99", ++ "mean_itl_ms": "Mean ITL (ms)", ++ "median_itl_ms": "Median ITL (ms)", ++ "p99_itl_ms": "P99 ITL (ms)", ++} ++ ++ ++def read_markdown(file): ++ if os.path.exists(file): ++ with open(file) as f: ++ return f.read() + "\n" ++ else: ++ return f"{file} not found.\n" ++ ++ ++def results_to_json(latency, throughput, serving): ++ return json.dumps({ ++ 'latency': latency.to_dict(), ++ 'throughput': throughput.to_dict(), ++ 'serving': serving.to_dict() ++ }) ++ ++ ++if __name__ == "__main__": ++ ++ # collect results ++ for test_file in results_folder.glob("*.json"): ++ ++ with open(test_file) as f: ++ raw_result = json.loads(f.read()) ++ ++ if "serving" in str(test_file): ++ # this result is generated via `benchmark_serving.py` ++ ++ # attach the benchmarking command to raw_result ++ with open(test_file.with_suffix(".commands")) as f: ++ command = json.loads(f.read()) ++ raw_result.update(command) ++ ++ # update the test name of this result ++ raw_result.update({"test_name": test_file.stem}) ++ ++ # add the result to raw_result ++ 
serving_results.append(raw_result) ++ continue ++ ++ elif "latency" in f.name: ++ # this result is generated via `benchmark_latency.py` ++ ++ # attach the benchmarking command to raw_result ++ with open(test_file.with_suffix(".commands")) as f: ++ command = json.loads(f.read()) ++ raw_result.update(command) ++ ++ # update the test name of this result ++ raw_result.update({"test_name": test_file.stem}) ++ ++ # get different percentiles ++ for perc in [10, 25, 50, 75, 90, 99]: ++ # Multiply 1000 to convert the time unit from s to ms ++ raw_result.update( ++ {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}) ++ raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 ++ ++ # add the result to raw_result ++ latency_results.append(raw_result) ++ continue ++ ++ elif "throughput" in f.name: ++ # this result is generated via `benchmark_throughput.py` ++ ++ # attach the benchmarking command to raw_result ++ with open(test_file.with_suffix(".commands")) as f: ++ command = json.loads(f.read()) ++ raw_result.update(command) ++ ++ # update the test name of this result ++ raw_result.update({"test_name": test_file.stem}) ++ ++ # add the result to raw_result ++ throughput_results.append(raw_result) ++ continue ++ ++ print(f"Skipping {test_file}") ++ ++ latency_results = pd.DataFrame.from_dict(latency_results) ++ serving_results = pd.DataFrame.from_dict(serving_results) ++ throughput_results = pd.DataFrame.from_dict(throughput_results) ++ ++ raw_results_json = results_to_json(latency_results, throughput_results, ++ serving_results) ++ ++ # remapping the key, for visualization purpose ++ if not latency_results.empty: ++ latency_results = latency_results[list( ++ latency_column_mapping.keys())].rename( ++ columns=latency_column_mapping) ++ if not serving_results.empty: ++ serving_results = serving_results[list( ++ serving_column_mapping.keys())].rename( ++ columns=serving_column_mapping) ++ if not throughput_results.empty: ++ throughput_results = throughput_results[list( ++ throughput_results_column_mapping.keys())].rename( ++ columns=throughput_results_column_mapping) ++ ++ processed_results_json = results_to_json(latency_results, ++ throughput_results, ++ serving_results) ++ ++ for df in [latency_results, serving_results, throughput_results]: ++ if df.empty: ++ continue ++ ++ # Sort all dataframes by their respective "Test name" columns ++ df.sort_values(by="Test name", inplace=True) ++ ++ # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", ++ # we want to turn it into "8xGPUTYPE" ++ df["GPU"] = df["GPU"].apply( ++ lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}") ++ ++ # get markdown tables ++ latency_md_table = tabulate(latency_results, ++ headers='keys', ++ tablefmt='pipe', ++ showindex=False) ++ serving_md_table = tabulate(serving_results, ++ headers='keys', ++ tablefmt='pipe', ++ showindex=False) ++ throughput_md_table = tabulate(throughput_results, ++ headers='keys', ++ tablefmt='pipe', ++ showindex=False) ++ ++ # document the result ++ with open(results_folder / "benchmark_results.md", "w") as f: ++ ++ results = read_markdown("../.buildkite/nightly-benchmarks/" + ++ "performance-benchmarks-descriptions.md") ++ results = results.format( ++ latency_tests_markdown_table=latency_md_table, ++ throughput_tests_markdown_table=throughput_md_table, ++ serving_tests_markdown_table=serving_md_table, ++ benchmarking_results_in_json_string=processed_results_json) ++ f.write(results) ++ ++ # document benchmarking results in json ++ with open(results_folder / "benchmark_results.json", 
"w") as f: ++ ++ results = latency_results.to_dict( ++ orient='records') + throughput_results.to_dict( ++ orient='records') + serving_results.to_dict(orient='records') ++ f.write(json.dumps(results)) +diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +new file mode 100644 +index 0000000..68ac590 +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +@@ -0,0 +1,26 @@ ++import argparse ++ ++from transformers import AutoTokenizer ++ ++ ++def main(model, cachedir): ++ # Load the tokenizer and save it to the specified directory ++ tokenizer = AutoTokenizer.from_pretrained(model) ++ tokenizer.save_pretrained(cachedir) ++ print(f"Tokenizer saved to {cachedir}") ++ ++ ++if __name__ == "__main__": ++ parser = argparse.ArgumentParser( ++ description="Download and save Hugging Face tokenizer") ++ parser.add_argument("--model", ++ type=str, ++ required=True, ++ help="Name of the model") ++ parser.add_argument("--cachedir", ++ type=str, ++ required=True, ++ help="Directory to save the tokenizer") ++ ++ args = parser.parse_args() ++ main(args.model, args.cachedir) +diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +new file mode 100644 +index 0000000..052060c +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +@@ -0,0 +1,95 @@ ++import argparse ++import json ++from pathlib import Path ++ ++import numpy as np ++import pandas as pd ++from tabulate import tabulate ++ ++ ++def parse_arguments(): ++ parser = argparse.ArgumentParser( ++ description= ++ 'Parse command line arguments for summary-nightly-results script.') ++ parser.add_argument('--results-folder', ++ type=str, ++ required=True, ++ help='The folder where the results are stored.') ++ parser.add_argument('--description', ++ type=str, ++ required=True, ++ help='Description of the results.') ++ ++ args = parser.parse_args() ++ return args ++ ++ ++def get_perf(df, method, model, metric): ++ ++ means = [] ++ ++ for qps in [2, 4, 8, 16, "inf"]: ++ target = df['Test name'].str.contains(model) ++ target = target & df['Engine'].str.contains(method) ++ target = target & df['Test name'].str.contains("qps_" + str(qps)) ++ filtered_df = df[target] ++ ++ if filtered_df.empty: ++ means.append(0.) 
++ else: ++ means.append(filtered_df[metric].values[0]) ++ ++ return np.array(means) ++ ++ ++def get_perf_w_std(df, method, model, metric): ++ ++ if metric in ["TTFT", "ITL"]: ++ mean = get_perf(df, method, model, "Mean " + metric + " (ms)") ++ mean = mean.tolist() ++ std = get_perf(df, method, model, "Std " + metric + " (ms)") ++ if std.mean() == 0: ++ std = None ++ success = get_perf(df, method, model, "Successful req.") ++ if std is not None: ++ std = std / np.sqrt(success) ++ std = std.tolist() ++ ++ else: ++ assert metric == "Tput" ++ mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( ++ df, method, model, "Output Tput (tok/s)") ++ mean = mean.tolist() ++ std = None ++ ++ return mean, std ++ ++ ++def main(args): ++ results_folder = Path(args.results_folder) ++ ++ results = [] ++ ++ # collect results ++ for test_file in results_folder.glob("*_nightly_results.json"): ++ with open(test_file) as f: ++ results = results + json.loads(f.read()) ++ ++ # generate markdown table ++ df = pd.DataFrame.from_dict(results) ++ ++ md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) ++ ++ with open(args.description) as f: ++ description = f.read() ++ ++ description = description.format( ++ nightly_results_benchmarking_table=md_table) ++ ++ with open("nightly_results.md", "w") as f: ++ f.write(description) ++ ++ ++if __name__ == '__main__': ++ args = parse_arguments() ++ main(args) +diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +new file mode 100644 +index 0000000..18bcc3a +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +@@ -0,0 +1,6 @@ ++from lmdeploy.serve.openai.api_client import APIClient ++ ++api_client = APIClient("http://localhost:8000") ++model_name = api_client.available_models[0] ++ ++print(model_name) +diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-server.sh +new file mode 100644 +index 0000000..fb5063d +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh +@@ -0,0 +1,228 @@ ++#!/bin/bash ++ ++# Currently FP8 benchmark is NOT enabled. 
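++# Launch the serving engine selected by $CURRENT_LLM_SERVING_ENGINE
++# (trt, tgi, lmdeploy, sglang or vllm) in the background.
++# $1: JSON string with engine-specific server parameters
++# $2: JSON string with common parameters (model, tp, port, ...)
++# Engine-specific parameters are expanded into CLI flags via json2args below.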
++ ++set -x ++server_params=$1 ++common_params=$2 ++ ++json2args() { ++ # transforms the JSON string to command line args, and '_' is replaced to '-' ++ # example: ++ # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } ++ # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 ++ local json_string=$1 ++ local args=$( ++ echo "$json_string" | jq -r ' ++ to_entries | ++ map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | ++ join(" ") ++ ' ++ ) ++ echo "$args" ++} ++ ++launch_trt_server() { ++ ++ model_path=$(echo "$common_params" | jq -r '.model') ++ model_name="${model_path#*/}" ++ model_type=$(echo "$server_params" | jq -r '.model_type') ++ model_dtype=$(echo "$server_params" | jq -r '.model_dtype') ++ model_tp_size=$(echo "$common_params" | jq -r '.tp') ++ max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size') ++ max_input_len=$(echo "$server_params" | jq -r '.max_input_len') ++ max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len') ++ max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens') ++ trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version') ++ ++ # create model caching directory ++ cd ~ ++ rm -rf models ++ mkdir -p models ++ cd models ++ models_dir=$(pwd) ++ trt_model_path=${models_dir}/${model_name}-trt-ckpt ++ trt_engine_path=${models_dir}/${model_name}-trt-engine ++ ++ # clone tensorrt backend ++ cd / ++ rm -rf tensorrtllm_backend ++ git clone https://github.com/triton-inference-server/tensorrtllm_backend.git ++ git lfs install ++ cd tensorrtllm_backend ++ git checkout "$trt_llm_version" ++ git submodule update --init --recursive ++ ++ # build trtllm engine ++ cd /tensorrtllm_backend ++ cd "./tensorrt_llm/examples/${model_type}" ++ python3 convert_checkpoint.py \ ++ --model_dir "${model_path}" \ ++ --dtype "${model_dtype}" \ ++ --tp_size "${model_tp_size}" \ ++ --output_dir "${trt_model_path}" ++ trtllm-build \ ++ --checkpoint_dir "${trt_model_path}" \ ++ --use_fused_mlp \ ++ --reduce_fusion disable \ ++ --workers 8 \ ++ --gpt_attention_plugin "${model_dtype}" \ ++ --gemm_plugin "${model_dtype}" \ ++ --tp_size "${model_tp_size}" \ ++ --max_batch_size "${max_batch_size}" \ ++ --max_input_len "${max_input_len}" \ ++ --max_seq_len "${max_seq_len}" \ ++ --max_num_tokens "${max_num_tokens}" \ ++ --output_dir "${trt_engine_path}" ++ ++ # handle triton protobuf files and launch triton server ++ cd /tensorrtllm_backend ++ mkdir triton_model_repo ++ cp -r all_models/inflight_batcher_llm/* triton_model_repo/ ++ cd triton_model_repo ++ rm -rf ./tensorrt_llm/1/* ++ cp -r "${trt_engine_path}"/* ./tensorrt_llm/1 ++ python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false ++ python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5" ++ python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false" ++ python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size" ++ python3 ../tools/fill_template.py -i 
tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1" ++ cd /tensorrtllm_backend ++ python3 scripts/launch_triton_server.py \ ++ --world_size="${model_tp_size}" \ ++ --model_repo=/tensorrtllm_backend/triton_model_repo & ++ ++} ++ ++launch_tgi_server() { ++ model=$(echo "$common_params" | jq -r '.model') ++ tp=$(echo "$common_params" | jq -r '.tp') ++ port=$(echo "$common_params" | jq -r '.port') ++ server_args=$(json2args "$server_params") ++ ++ if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then ++ echo "Key 'fp8' exists in common params." ++ server_command="/tgi-entrypoint.sh \ ++ --model-id $model \ ++ --num-shard $tp \ ++ --port $port \ ++ --quantize fp8 \ ++ $server_args" ++ else ++ echo "Key 'fp8' does not exist in common params." ++ server_command="/tgi-entrypoint.sh \ ++ --model-id $model \ ++ --num-shard $tp \ ++ --port $port \ ++ $server_args" ++ fi ++ ++ echo "Server command: $server_command" ++ eval "$server_command" & ++ ++} ++ ++launch_lmdeploy_server() { ++ model=$(echo "$common_params" | jq -r '.model') ++ tp=$(echo "$common_params" | jq -r '.tp') ++ port=$(echo "$common_params" | jq -r '.port') ++ server_args=$(json2args "$server_params") ++ ++ server_command="lmdeploy serve api_server $model \ ++ --tp $tp \ ++ --server-port $port \ ++ $server_args" ++ ++ # run the server ++ echo "Server command: $server_command" ++ bash -c "$server_command" & ++} ++ ++launch_sglang_server() { ++ ++ model=$(echo "$common_params" | jq -r '.model') ++ tp=$(echo "$common_params" | jq -r '.tp') ++ port=$(echo "$common_params" | jq -r '.port') ++ server_args=$(json2args "$server_params") ++ ++ if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then ++ echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." ++ model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') ++ server_command="python3 \ ++ -m sglang.launch_server \ ++ --tp $tp \ ++ --model-path $model \ ++ --port $port \ ++ $server_args" ++ else ++ echo "Key 'fp8' does not exist in common params." ++ server_command="python3 \ ++ -m sglang.launch_server \ ++ --tp $tp \ ++ --model-path $model \ ++ --port $port \ ++ $server_args" ++ fi ++ ++ # run the server ++ echo "Server command: $server_command" ++ eval "$server_command" & ++} ++ ++launch_vllm_server() { ++ ++ export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') ++ ++ model=$(echo "$common_params" | jq -r '.model') ++ tp=$(echo "$common_params" | jq -r '.tp') ++ port=$(echo "$common_params" | jq -r '.port') ++ server_args=$(json2args "$server_params") ++ ++ if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then ++ echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." ++ model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') ++ server_command="python3 \ ++ -m vllm.entrypoints.openai.api_server \ ++ -tp $tp \ ++ --model $model \ ++ --port $port \ ++ $server_args" ++ else ++ echo "Key 'fp8' does not exist in common params." 
++ server_command="python3 \ ++ -m vllm.entrypoints.openai.api_server \ ++ -tp $tp \ ++ --model $model \ ++ --port $port \ ++ $server_args" ++ fi ++ ++ # run the server ++ echo "Server command: $server_command" ++ eval "$server_command" & ++} ++ ++main() { ++ ++ if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then ++ launch_trt_server ++ fi ++ ++ if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then ++ launch_tgi_server ++ fi ++ ++ if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then ++ launch_lmdeploy_server ++ fi ++ ++ if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then ++ launch_sglang_server ++ fi ++ ++ if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then ++ launch_vllm_server ++ fi ++} ++ ++main +diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +new file mode 100644 +index 0000000..686f70d +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +@@ -0,0 +1,78 @@ ++#!/bin/bash ++ ++set -ex ++set -o pipefail ++ ++ ++main() { ++ ++ (which wget && which curl) || (apt-get update && apt-get install -y wget curl) ++ (which jq) || (apt-get update && apt-get -y install jq) ++ (which zip) || (apt-get install -y zip) ++ ++ if [ ! -f /workspace/buildkite-agent ]; then ++ echo "buildkite-agent binary not found. Skip plotting the results." ++ exit 0 ++ fi ++ ++ # initial annotation ++ #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" ++ ++ # download results ++ cd "$VLLM_SOURCE_CODE_LOC/benchmarks" ++ mkdir -p results/ ++ /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ ++ ls ++ ls results/ ++ ++ # upload benchmark results ++ zip -r results.zip results/ ++ /workspace/buildkite-agent artifact upload "results.zip" ++ ++ # upload benchmarking scripts ++ cd "$VLLM_SOURCE_CODE_LOC/" ++ zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ ++ /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" ++ ++ cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" ++ # upload benchmarking pipeline ++ /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" ++ ++ cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" ++ /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md ++ ++ ++ ++ # The figures should be genereated by a separate process outside the CI/CD pipeline ++ ++ # # generate figures ++ # python3 -m pip install tabulate pandas matplotlib ++ ++ # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \ ++ # --description $description \ ++ # --results-folder results/ ++ ++ ++ # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ ++ # --description $description \ ++ # --results-folder results/ \ ++ # --dataset sharegpt ++ ++ # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ ++ # --description $description \ ++ # --results-folder results/ \ ++ # --dataset sonnet_2048_128 ++ ++ # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ ++ # --description $description \ ++ # --results-folder results/ \ ++ # --dataset sonnet_128_2048 ++ ++ # # upload results and figures ++ # /workspace/buildkite-agent artifact upload "nightly_results*.png" ++ # /workspace/buildkite-agent artifact upload 
$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml ++ # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json ++ # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md ++} ++ ++main "$@" +diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +new file mode 100644 +index 0000000..3f38cf5 +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +@@ -0,0 +1,355 @@ ++#!/bin/bash ++ ++set -o pipefail ++set -x ++ ++check_gpus() { ++ # check the number of GPUs and GPU type. ++ declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) ++ if [[ $gpu_count -gt 0 ]]; then ++ echo "GPU found." ++ else ++ echo "Need at least 1 GPU to run benchmarking." ++ exit 1 ++ fi ++ declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')" ++ echo "GPU type is $gpu_type" ++} ++ ++check_hf_token() { ++ # check if HF_TOKEN is available and valid ++ if [[ -z "$HF_TOKEN" ]]; then ++ echo "Error: HF_TOKEN is not set." ++ exit 1 ++ elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then ++ echo "Error: HF_TOKEN does not start with 'hf_'." ++ exit 1 ++ else ++ echo "HF_TOKEN is set and valid." ++ fi ++} ++ ++ ++upload_to_buildkite() { ++ # upload the benchmarking results to buildkite ++ ++ # if the agent binary is not found, skip uploading the results, exit 0 ++ if [ ! -f /workspace/buildkite-agent ]; then ++ echo "buildkite-agent binary not found. Skip uploading the results." ++ return 0 ++ fi ++ # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md ++ /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" ++} ++ ++ ++get_current_llm_serving_engine() { ++ ++ if which lmdeploy >/dev/null; then ++ echo "Container: lmdeploy" ++ export CURRENT_LLM_SERVING_ENGINE=lmdeploy ++ return ++ fi ++ ++ if [ -e /tgi-entrypoint.sh ]; then ++ echo "Container: tgi" ++ export CURRENT_LLM_SERVING_ENGINE=tgi ++ return ++ fi ++ ++ if which trtllm-build >/dev/null; then ++ echo "Container: tensorrt-llm" ++ export CURRENT_LLM_SERVING_ENGINE=trt ++ return ++ fi ++ ++ if [ -e /sgl-workspace ]; then ++ echo "Container: sglang" ++ export CURRENT_LLM_SERVING_ENGINE=sglang ++ return ++ fi ++ ++ if [ -e /vllm-workspace ]; then ++ echo "Container: vllm" ++ # move to a completely irrelevant directory, to avoid import vllm from current folder ++ export CURRENT_LLM_SERVING_ENGINE=vllm ++ ++ return ++ fi ++} ++ ++json2args() { ++ # transforms the JSON string to command line args, and '_' is replaced to '-' ++ # example: ++ # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } ++ # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 ++ local json_string=$1 ++ local args=$( ++ echo "$json_string" | jq -r ' ++ to_entries | ++ map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | ++ join(" ") ++ ' ++ ) ++ echo "$args" ++} ++ ++kill_gpu_processes() { ++ pkill -f python ++ pkill -f python3 ++ pkill -f tritonserver ++ pkill -f pt_main_thread ++ pkill -f text-generation ++ pkill -f lmdeploy ++ ++ while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do ++ sleep 1 ++ done ++} ++ ++wait_for_server() { ++ # wait for vllm server to start ++ # return 1 if vllm 
server crashes ++ timeout 1200 bash -c ' ++ until curl -s localhost:8000/v1/completions > /dev/null; do ++ sleep 1 ++ done' && return 0 || return 1 ++} ++ ++ensure_installed() { ++ # Ensure that the given command is installed by apt-get ++ local cmd=$1 ++ if ! which "$cmd" >/dev/null; then ++ apt-get update && apt-get install -y "$cmd" ++ fi ++} ++ ++run_serving_tests() { ++ # run serving tests using `benchmark_serving.py` ++ # $1: a json file specifying serving test cases ++ ++ local serving_test_file ++ serving_test_file=$1 ++ ++ # Iterate over serving tests ++ jq -c '.[]' "$serving_test_file" | while read -r params; do ++ # get the test name, and append the GPU type back to it. ++ test_name=$(echo "$params" | jq -r '.test_name') ++ ++ # if TEST_SELECTOR is set, only run the test cases that match the selector ++ if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then ++ echo "Skip test case $test_name." ++ continue ++ fi ++ ++ # prepend the current serving engine to the test name ++ test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} ++ ++ # get common parameters ++ common_params=$(echo "$params" | jq -r '.common_parameters') ++ model=$(echo "$common_params" | jq -r '.model') ++ tp=$(echo "$common_params" | jq -r '.tp') ++ dataset_name=$(echo "$common_params" | jq -r '.dataset_name') ++ dataset_path=$(echo "$common_params" | jq -r '.dataset_path') ++ port=$(echo "$common_params" | jq -r '.port') ++ num_prompts=$(echo "$common_params" | jq -r '.num_prompts') ++ reuse_server=$(echo "$common_params" | jq -r '.reuse_server') ++ ++ # get client and server arguments ++ server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") ++ client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters") ++ client_args=$(json2args "$client_params") ++ qps_list=$(echo "$params" | jq -r '.qps_list') ++ qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') ++ echo "Running over qps list $qps_list" ++ ++ # check if there is enough GPU to run the test ++ if [[ $gpu_count -lt $tp ]]; then ++ echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." ++ continue ++ fi ++ ++ if [[ $reuse_server == "true" ]]; then ++ echo "Reuse previous server for test case $test_name" ++ else ++ kill_gpu_processes ++ bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ ++ "$server_params" "$common_params" ++ fi ++ ++ if wait_for_server; then ++ echo "" ++ echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." ++ else ++ echo "" ++ echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." ++ break ++ fi ++ ++ # prepare tokenizer ++ # this is required for lmdeploy. 
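++        # The cached tokenizer is passed to benchmark_serving.py via
++        # --tokenizer /tokenizer_cache, so token counting does not depend on
++        # the (engine-specific) model name reported by the server.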
++ cd "$VLLM_SOURCE_CODE_LOC/benchmarks" ++ rm -rf /tokenizer_cache ++ mkdir /tokenizer_cache ++ python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ ++ --model "$model" \ ++ --cachedir /tokenizer_cache ++ cd "$VLLM_SOURCE_CODE_LOC/benchmarks" ++ ++ ++ # change model name for lmdeploy (it will not follow standard hf name) ++ if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then ++ model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) ++ fi ++ ++ # iterate over different QPS ++ for qps in $qps_list; do ++ # remove the surrounding single quote from qps ++ if [[ "$qps" == *"inf"* ]]; then ++ echo "qps was $qps" ++ qps="inf" ++ echo "now qps is $qps" ++ fi ++ ++ new_test_name=$test_name"_qps_"$qps ++ ++ backend=$CURRENT_LLM_SERVING_ENGINE ++ ++ if [[ $backend = "trt" ]]; then ++ backend="tensorrt-llm" ++ fi ++ ++ if [[ "$backend" == *"vllm"* ]]; then ++ backend="vllm" ++ fi ++ ++ if [[ "$dataset_name" = "sharegpt" ]]; then ++ ++ client_command="python3 benchmark_serving.py \ ++ --backend $backend \ ++ --tokenizer /tokenizer_cache \ ++ --model $model \ ++ --dataset-name $dataset_name \ ++ --dataset-path $dataset_path \ ++ --num-prompts $num_prompts \ ++ --port $port \ ++ --save-result \ ++ --result-dir $RESULTS_FOLDER \ ++ --result-filename ${new_test_name}.json \ ++ --request-rate $qps \ ++ --ignore-eos \ ++ $client_args" ++ ++ elif [[ "$dataset_name" = "sonnet" ]]; then ++ ++ sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len') ++ sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') ++ sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') ++ ++ client_command="python3 benchmark_serving.py \ ++ --backend $backend \ ++ --tokenizer /tokenizer_cache \ ++ --model $model \ ++ --dataset-name $dataset_name \ ++ --dataset-path $dataset_path \ ++ --num-prompts $num_prompts \ ++ --sonnet-input-len $sonnet_input_len \ ++ --sonnet-output-len $sonnet_output_len \ ++ --sonnet-prefix-len $sonnet_prefix_len \ ++ --port $port \ ++ --save-result \ ++ --result-dir $RESULTS_FOLDER \ ++ --result-filename ${new_test_name}.json \ ++ --request-rate $qps \ ++ --ignore-eos \ ++ $client_args" ++ ++ else ++ ++ echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." 
++ exit 1 ++ ++ fi ++ ++ ++ ++ echo "Running test case $test_name with qps $qps" ++ echo "Client command: $client_command" ++ ++ eval "$client_command" ++ ++ server_command="None" ++ ++ # record the benchmarking commands ++ jq_output=$(jq -n \ ++ --arg server "$server_command" \ ++ --arg client "$client_command" \ ++ --arg gpu "$gpu_type" \ ++ --arg engine "$CURRENT_LLM_SERVING_ENGINE" \ ++ '{ ++ server_command: $server, ++ client_command: $client, ++ gpu_type: $gpu, ++ engine: $engine ++ }') ++ echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" ++ ++ done ++ ++ done ++ ++ kill_gpu_processes ++} ++ ++ ++prepare_dataset() { ++ ++ # download sharegpt dataset ++ cd "$VLLM_SOURCE_CODE_LOC/benchmarks" ++ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json ++ ++ # duplicate sonnet by 4x, to allow benchmarking with input length 2048 ++ cd "$VLLM_SOURCE_CODE_LOC/benchmarks" ++ echo "" > sonnet_4x.txt ++ for _ in {1..4} ++ do ++ cat sonnet.txt >> sonnet_4x.txt ++ done ++ ++} ++ ++main() { ++ ++ # check if the environment variable is successfully injected from yaml ++ ++ check_gpus ++ check_hf_token ++ get_current_llm_serving_engine ++ ++ pip install -U transformers ++ ++ # check storage ++ df -h ++ ++ ensure_installed wget ++ ensure_installed curl ++ ensure_installed jq ++ ++ prepare_dataset ++ ++ cd "$VLLM_SOURCE_CODE_LOC/benchmarks" ++ declare -g RESULTS_FOLDER=results/ ++ mkdir -p $RESULTS_FOLDER ++ BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" ++ ++ # run the test ++ run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" ++ ++ # upload benchmark results to buildkite ++ python3 -m pip install tabulate pandas ++ python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" ++ upload_to_buildkite ++ ++} ++ ++main "$@" +diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +new file mode 100644 +index 0000000..0d16a83 +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +@@ -0,0 +1,377 @@ ++#!/bin/bash ++ ++# This script should be run inside the CI process ++# This script assumes that we are already inside the vllm/ directory ++# Benchmarking results will be available inside vllm/benchmarks/results/ ++ ++# Do not set -e, as the mixtral 8x22B model tends to crash occasionally ++# and we still want to see other benchmarking results even when mixtral crashes. ++set -x ++set -o pipefail ++ ++check_gpus() { ++ # check the number of GPUs and GPU type. ++ declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) ++ if [[ $gpu_count -gt 0 ]]; then ++ echo "GPU found." ++ else ++ echo "Need at least 1 GPU to run benchmarking." ++ exit 1 ++ fi ++ declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') ++ echo "GPU type is $gpu_type" ++} ++ ++check_hf_token() { ++ # check if HF_TOKEN is available and valid ++ if [[ -z "$HF_TOKEN" ]]; then ++ echo "Error: HF_TOKEN is not set." ++ exit 1 ++ elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then ++ echo "Error: HF_TOKEN does not start with 'hf_'." ++ exit 1 ++ else ++ echo "HF_TOKEN is set and valid." ++ fi ++} ++ ++ensure_sharegpt_downloaded() { ++ local FILE=ShareGPT_V3_unfiltered_cleaned_split.json ++ if [ ! -f "$FILE" ]; then ++ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE ++ else ++ echo "$FILE already exists." 
++ fi ++} ++ ++json2args() { ++ # transforms the JSON string to command line args, and '_' is replaced to '-' ++ # example: ++ # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } ++ # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 ++ local json_string=$1 ++ local args=$( ++ echo "$json_string" | jq -r ' ++ to_entries | ++ map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | ++ join(" ") ++ ' ++ ) ++ echo "$args" ++} ++ ++wait_for_server() { ++ # wait for vllm server to start ++ # return 1 if vllm server crashes ++ timeout 1200 bash -c ' ++ until curl -X POST localhost:8000/v1/completions; do ++ sleep 1 ++ done' && return 0 || return 1 ++} ++ ++kill_processes_launched_by_current_bash() { ++ # Kill all python processes launched from current bash script ++ current_shell_pid=$$ ++ processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}') ++ if [ -n "$processes" ]; then ++ echo "Killing the following processes matching '$1':" ++ echo "$processes" ++ echo "$processes" | xargs kill -9 ++ else ++ echo "No processes found matching '$1'." ++ fi ++} ++ ++kill_gpu_processes() { ++ ++ ps -aux ++ lsof -t -i:8000 | xargs -r kill -9 ++ pgrep python3 | xargs -r kill -9 ++ ++ ++ # wait until GPU memory usage smaller than 1GB ++ while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do ++ sleep 1 ++ done ++ ++ # remove vllm config file ++ rm -rf ~/.config/vllm ++ ++} ++ ++upload_to_buildkite() { ++ # upload the benchmarking results to buildkite ++ ++ # if the agent binary is not found, skip uploading the results, exit 0 ++ # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent ++ if command -v buildkite-agent >/dev/null 2>&1; then ++ BUILDKITE_AGENT_COMMAND="buildkite-agent" ++ elif [ -f /workspace/buildkite-agent ]; then ++ BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent" ++ else ++ echo "buildkite-agent binary not found. Skip uploading the results." ++ return 0 ++ fi ++ ++ # Use the determined command to annotate and upload artifacts ++ $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md" ++ $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" ++} ++ ++run_latency_tests() { ++ # run latency tests using `benchmark_latency.py` ++ # $1: a json file specifying latency test cases ++ ++ local latency_test_file ++ latency_test_file=$1 ++ ++ # Iterate over latency tests ++ jq -c '.[]' "$latency_test_file" | while read -r params; do ++ # get the test name, and append the GPU type back to it. ++ test_name=$(echo "$params" | jq -r '.test_name') ++ if [[ ! "$test_name" =~ ^latency_ ]]; then ++ echo "In latency-test.json, test_name must start with \"latency_\"." ++ exit 1 ++ fi ++ ++ # if TEST_SELECTOR is set, only run the test cases that match the selector ++ if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then ++ echo "Skip test case $test_name." ++ continue ++ fi ++ ++ # get arguments ++ latency_params=$(echo "$params" | jq -r '.parameters') ++ latency_args=$(json2args "$latency_params") ++ ++ # check if there is enough GPU to run the test ++ tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') ++ if [[ $gpu_count -lt $tp ]]; then ++ echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." 
++ continue ++ fi ++ ++ latency_command="python3 benchmark_latency.py \ ++ --output-json $RESULTS_FOLDER/${test_name}.json \ ++ $latency_args" ++ ++ echo "Running test case $test_name" ++ echo "Latency command: $latency_command" ++ ++ # recoding benchmarking command ang GPU command ++ jq_output=$(jq -n \ ++ --arg latency "$latency_command" \ ++ --arg gpu "$gpu_type" \ ++ '{ ++ latency_command: $latency, ++ gpu_type: $gpu ++ }') ++ echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands" ++ ++ # run the benchmark ++ eval "$latency_command" ++ ++ kill_gpu_processes ++ ++ done ++} ++ ++run_throughput_tests() { ++ # run throughput tests using `benchmark_throughput.py` ++ # $1: a json file specifying throughput test cases ++ ++ local throughput_test_file ++ throughput_test_file=$1 ++ ++ # Iterate over throughput tests ++ jq -c '.[]' "$throughput_test_file" | while read -r params; do ++ # get the test name, and append the GPU type back to it. ++ test_name=$(echo "$params" | jq -r '.test_name') ++ if [[ ! "$test_name" =~ ^throughput_ ]]; then ++ echo "In throughput-test.json, test_name must start with \"throughput_\"." ++ exit 1 ++ fi ++ ++ # if TEST_SELECTOR is set, only run the test cases that match the selector ++ if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then ++ echo "Skip test case $test_name." ++ continue ++ fi ++ ++ # get arguments ++ throughput_params=$(echo "$params" | jq -r '.parameters') ++ throughput_args=$(json2args "$throughput_params") ++ ++ # check if there is enough GPU to run the test ++ tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') ++ if [[ $gpu_count -lt $tp ]]; then ++ echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." ++ continue ++ fi ++ ++ throughput_command="python3 benchmark_throughput.py \ ++ --output-json $RESULTS_FOLDER/${test_name}.json \ ++ $throughput_args" ++ ++ echo "Running test case $test_name" ++ echo "Throughput command: $throughput_command" ++ # recoding benchmarking command ang GPU command ++ jq_output=$(jq -n \ ++ --arg command "$throughput_command" \ ++ --arg gpu "$gpu_type" \ ++ '{ ++ throughput_command: $command, ++ gpu_type: $gpu ++ }') ++ echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands" ++ ++ # run the benchmark ++ eval "$throughput_command" ++ ++ kill_gpu_processes ++ ++ done ++} ++ ++run_serving_tests() { ++ # run serving tests using `benchmark_serving.py` ++ # $1: a json file specifying serving test cases ++ ++ local serving_test_file ++ serving_test_file=$1 ++ ++ # Iterate over serving tests ++ jq -c '.[]' "$serving_test_file" | while read -r params; do ++ # get the test name, and append the GPU type back to it. ++ test_name=$(echo "$params" | jq -r '.test_name') ++ if [[ ! "$test_name" =~ ^serving_ ]]; then ++ echo "In serving-test.json, test_name must start with \"serving_\"." ++ exit 1 ++ fi ++ ++ # if TEST_SELECTOR is set, only run the test cases that match the selector ++ if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then ++ echo "Skip test case $test_name." 
++ continue ++ fi ++ ++ # get client and server arguments ++ server_params=$(echo "$params" | jq -r '.server_parameters') ++ client_params=$(echo "$params" | jq -r '.client_parameters') ++ server_args=$(json2args "$server_params") ++ client_args=$(json2args "$client_params") ++ qps_list=$(echo "$params" | jq -r '.qps_list') ++ qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') ++ echo "Running over qps list $qps_list" ++ ++ # check if there is enough GPU to run the test ++ tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') ++ if [[ $gpu_count -lt $tp ]]; then ++ echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." ++ continue ++ fi ++ ++ # check if server model and client model is aligned ++ server_model=$(echo "$server_params" | jq -r '.model') ++ client_model=$(echo "$client_params" | jq -r '.model') ++ if [[ $server_model != "$client_model" ]]; then ++ echo "Server model and client model must be the same. Skip testcase $test_name." ++ continue ++ fi ++ ++ server_command="python3 \ ++ -m vllm.entrypoints.openai.api_server \ ++ $server_args" ++ ++ # run the server ++ echo "Running test case $test_name" ++ echo "Server command: $server_command" ++ bash -c "$server_command" & ++ server_pid=$! ++ ++ # wait until the server is alive ++ if wait_for_server; then ++ echo "" ++ echo "vllm server is up and running." ++ else ++ echo "" ++ echo "vllm failed to start within the timeout period." ++ fi ++ ++ # iterate over different QPS ++ for qps in $qps_list; do ++ # remove the surrounding single quote from qps ++ if [[ "$qps" == *"inf"* ]]; then ++ echo "qps was $qps" ++ qps="inf" ++ echo "now qps is $qps" ++ fi ++ ++ new_test_name=$test_name"_qps_"$qps ++ ++ client_command="python3 benchmark_serving.py \ ++ --save-result \ ++ --result-dir $RESULTS_FOLDER \ ++ --result-filename ${new_test_name}.json \ ++ --request-rate $qps \ ++ $client_args" ++ ++ echo "Running test case $test_name with qps $qps" ++ echo "Client command: $client_command" ++ ++ bash -c "$client_command" ++ ++ # record the benchmarking commands ++ jq_output=$(jq -n \ ++ --arg server "$server_command" \ ++ --arg client "$client_command" \ ++ --arg gpu "$gpu_type" \ ++ '{ ++ server_command: $server, ++ client_command: $client, ++ gpu_type: $gpu ++ }') ++ echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" ++ ++ done ++ ++ # clean up ++ kill -9 $server_pid ++ kill_gpu_processes ++ done ++} ++ ++main() { ++ check_gpus ++ check_hf_token ++ ++ # dependencies ++ (which wget && which curl) || (apt-get update && apt-get install -y wget curl) ++ (which jq) || (apt-get update && apt-get -y install jq) ++ (which lsof) || (apt-get update && apt-get install -y lsof) ++ ++ # get the current IP address, required by benchmark_serving.py ++ export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') ++ # turn of the reporting of the status of each request, to clean up the terminal output ++ export VLLM_LOG_LEVEL="WARNING" ++ ++ # prepare for benchmarking ++ cd benchmarks || exit 1 ++ ensure_sharegpt_downloaded ++ declare -g RESULTS_FOLDER=results/ ++ mkdir -p $RESULTS_FOLDER ++ QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ ++ ++ # benchmarking ++ run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json ++ run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json ++ run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json ++ ++ # postprocess benchmarking results ++ pip install tabulate pandas ++ python3 
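[Editor's note] The serving loop above depends on `wait_for_server`, which just retries a request against the completions endpoint until the server accepts connections or the 1200-second timeout expires. A generic, self-contained sketch of that poll-until-up pattern (URL and timeout below are illustrative):

```bash
#!/bin/bash
# Poll an endpoint until it answers or the timeout expires. curl exits non-zero only
# when the connection itself fails, so any HTTP response counts as "up".
wait_for_endpoint() {
    local url=$1
    local timeout_s=${2:-1200}
    timeout "$timeout_s" bash -c "
        until curl -s -X POST -o /dev/null '$url'; do
            sleep 1
        done" && return 0 || return 1
}

if wait_for_endpoint "http://localhost:8000/v1/completions" 60; then
    echo "server is up"
else
    echo "server did not come up in time"
fi
```

Note that in the script above a startup failure only prints a message; the client loop still runs, so a crashed server shows up as failed requests rather than an aborted test.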
$QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py ++ ++ upload_to_buildkite ++} ++ ++main "$@" +diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +new file mode 100644 +index 0000000..92d6fad +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +@@ -0,0 +1,83 @@ ++import datetime ++import json ++import os ++from pathlib import Path ++ ++import pandas as pd ++from tabulate import tabulate ++ ++results_folder = Path("results/") ++ ++# serving results and the keys that will be printed into markdown ++serving_results = [] ++serving_column_mapping = { ++ "test_name": "Test name", ++ "gpu_type": "GPU", ++ "completed": "Successful req.", ++ "request_throughput": "Tput (req/s)", ++ "mean_ttft_ms": "Mean TTFT (ms)", ++ "std_ttft_ms": "Std TTFT (ms)", ++ "median_ttft_ms": "Median TTFT (ms)", ++ "mean_itl_ms": "Mean ITL (ms)", ++ "std_itl_ms": "Std ITL (ms)", ++ "median_itl_ms": "Median ITL (ms)", ++ "mean_tpot_ms": "Mean TPOT (ms)", ++ "std_tpot_ms": "Std TPOT (ms)", ++ "median_tpot_ms": "Median TPOT (ms)", ++ "total_token_throughput": "Total Token Tput (tok/s)", ++ "output_throughput": "Output Tput (tok/s)", ++ "total_input_tokens": "Total input tokens", ++ "total_output_tokens": "Total output tokens", ++ "engine": "Engine", ++} ++ ++if __name__ == "__main__": ++ ++ # collect results ++ for test_file in results_folder.glob("*.json"): ++ ++ with open(test_file) as f: ++ raw_result = json.loads(f.read()) ++ ++ # attach the benchmarking command to raw_result ++ with open(test_file.with_suffix(".commands")) as f: ++ command = json.loads(f.read()) ++ raw_result.update(command) ++ ++ # update the test name of this result ++ raw_result.update({"test_name": test_file.stem}) ++ ++ # add the result to raw_result ++ serving_results.append(raw_result) ++ continue ++ ++ serving_results = pd.DataFrame.from_dict(serving_results) ++ ++ if not serving_results.empty: ++ serving_results = serving_results[list( ++ serving_column_mapping.keys())].rename( ++ columns=serving_column_mapping) ++ ++ serving_md_table_with_headers = tabulate(serving_results, ++ headers='keys', ++ tablefmt='pipe', ++ showindex=False) ++ # remove the first line of header ++ serving_md_table_lines = serving_md_table_with_headers.split('\n') ++ serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:]) ++ ++ prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") ++ prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE") ++ ++ # document benchmarking results in markdown ++ with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: ++ # document results with header. ++ # for those who wants to reproduce our benchmark. 
++ f.write(serving_md_table_with_headers) ++ f.write('\n') ++ ++ # document benchmarking results in json ++ with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: ++ ++ results = serving_results.to_dict(orient='records') ++ f.write(json.dumps(results)) +diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh +new file mode 100644 +index 0000000..aa0f7ad +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh +@@ -0,0 +1,19 @@ ++#!/bin/sh ++TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) ++URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" ++ ++TIMEOUT_SECONDS=10 ++ ++retries=0 ++while [ $retries -lt 1000 ]; do ++ if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then ++ exit 0 ++ fi ++ ++ echo "Waiting for image to be available..." ++ ++ retries=$((retries + 1)) ++ sleep 5 ++done ++ ++exit 1 +diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json +new file mode 100644 +index 0000000..1841186 +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json +@@ -0,0 +1,32 @@ ++[ ++ { ++ "test_name": "latency_llama8B_tp1", ++ "parameters": { ++ "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", ++ "tensor_parallel_size": 1, ++ "load_format": "dummy", ++ "num_iters_warmup": 5, ++ "num_iters": 15 ++ } ++ }, ++ { ++ "test_name": "latency_llama70B_tp4", ++ "parameters": { ++ "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", ++ "tensor_parallel_size": 4, ++ "load_format": "dummy", ++ "num-iters-warmup": 5, ++ "num-iters": 15 ++ } ++ }, ++ { ++ "test_name": "latency_mixtral8x7B_tp2", ++ "parameters": { ++ "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", ++ "tensor_parallel_size": 2, ++ "load_format": "dummy", ++ "num-iters-warmup": 5, ++ "num-iters": 15 ++ } ++ } ++] +\ No newline at end of file +diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json +new file mode 100644 +index 0000000..fda1a7a +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json +@@ -0,0 +1,323 @@ ++[ ++ { ++ "test_name": "llama8B_tp1_sharegpt", ++ "qps_list": [4,8,16,32,"inf"], ++ "common_parameters": { ++ "model": "meta-llama/Meta-Llama-3-8B-Instruct", ++ "tp": 1, ++ "dataset_name": "sharegpt", ++ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", ++ "num_prompts": 500, ++ "port": 8000, ++ "reuse_server": false ++ }, ++ "lmdeploy_server_parameters": { ++ "dtype": "bfloat16" ++ }, ++ "lmdeploy_client_parameters": { ++ }, ++ "tgi_server_parameters": { ++ }, ++ "tgi_client_parameters": { ++ "endpoint": "/generate_stream" ++ }, ++ "trt_server_parameters": { ++ "model_type": "llama", ++ "model_dtype": "bfloat16", ++ "max_batch_size": 2048, ++ "max_input_len": 4096, ++ "max_seq_len": 6144, ++ "max_num_tokens": 16384, ++ "trt_llm_version": "v0.11.0" ++ }, ++ "trt_client_parameters": { ++ "endpoint": "/v2/models/ensemble/generate_stream" ++ }, ++ "vllm_server_parameters": { ++ "disable_log_stats": "", ++ "disable_log_requests": "", ++ "gpu_memory_utilization": 0.9, ++ "num_scheduler_steps": 10, ++ "max_num_seqs": 512, ++ "dtype": "bfloat16" ++ }, ++ "vllm_client_parameters": { ++ }, ++ "sglang_server_parameters": { ++ 
"disable_radix_cache": "", ++ "enable_torch_compile": "", ++ "dtype": "bfloat16" ++ }, ++ "sglang_client_parameters": { ++ } ++ }, ++ { ++ "test_name": "llama8B_tp1_sonnet_512_16", ++ "qps_list": [4,8,16,32,"inf"], ++ "common_parameters": { ++ "model": "meta-llama/Meta-Llama-3-8B-Instruct", ++ "tp": 1, ++ "dataset_name": "sonnet", ++ "dataset_path": "./sonnet_4x.txt", ++ "num_prompts": 500, ++ "port": 8000, ++ "sonnet_input_len": 512, ++ "sonnet_output_len": 16, ++ "sonnet_prefix_len": 50, ++ "reuse_server": true ++ }, ++ "lmdeploy_server_parameters": { ++ "dtype": "bfloat16" ++ }, ++ "lmdeploy_client_parameters": { ++ }, ++ "tgi_server_parameters": { ++ }, ++ "tgi_client_parameters": { ++ "endpoint": "/generate_stream" ++ }, ++ "trt_server_parameters": { ++ "model_type": "llama", ++ "model_dtype": "bfloat16", ++ "max_batch_size": 2048, ++ "max_input_len": 4096, ++ "max_seq_len": 6144, ++ "max_num_tokens": 16384, ++ "trt_llm_version": "v0.11.0" ++ }, ++ "trt_client_parameters": { ++ "endpoint": "/v2/models/ensemble/generate_stream" ++ }, ++ "vllm_server_parameters": { ++ "disable_log_stats": "", ++ "disable_log_requests": "", ++ "gpu_memory_utilization": 0.9, ++ "num_scheduler_steps": 10, ++ "max_num_seqs": 512, ++ "dtype": "bfloat16" ++ }, ++ "vllm_client_parameters": { ++ }, ++ "sglang_server_parameters": { ++ "disable_radix_cache": "", ++ "enable_torch_compile": "", ++ "dtype": "bfloat16" ++ }, ++ "sglang_client_parameters": { ++ } ++ }, ++ { ++ "test_name": "llama8B_tp1_sonnet_512_256", ++ "qps_list": [4,8,16,32,"inf"], ++ "common_parameters": { ++ "model": "meta-llama/Meta-Llama-3-8B-Instruct", ++ "tp": 1, ++ "dataset_name": "sonnet", ++ "dataset_path": "./sonnet_4x.txt", ++ "num_prompts": 500, ++ "port": 8000, ++ "sonnet_input_len": 512, ++ "sonnet_output_len": 256, ++ "sonnet_prefix_len": 50, ++ "reuse_server": true ++ }, ++ "lmdeploy_server_parameters": { ++ "dtype": "bfloat16" ++ }, ++ "lmdeploy_client_parameters": { ++ }, ++ "tgi_server_parameters": { ++ }, ++ "tgi_client_parameters": { ++ "endpoint": "/generate_stream" ++ }, ++ "trt_server_parameters": { ++ "model_type": "llama", ++ "model_dtype": "bfloat16", ++ "max_batch_size": 2048, ++ "max_input_len": 4096, ++ "max_seq_len": 6144, ++ "max_num_tokens": 16384, ++ "trt_llm_version": "v0.11.0" ++ }, ++ "trt_client_parameters": { ++ "endpoint": "/v2/models/ensemble/generate_stream" ++ }, ++ "vllm_server_parameters": { ++ "disable_log_stats": "", ++ "disable_log_requests": "", ++ "gpu_memory_utilization": 0.9, ++ "num_scheduler_steps": 10, ++ "max_num_seqs": 512, ++ "dtype": "bfloat16" ++ }, ++ "vllm_client_parameters": { ++ }, ++ "sglang_server_parameters": { ++ "disable_radix_cache": "", ++ "enable_torch_compile": "", ++ "dtype": "bfloat16" ++ }, ++ "sglang_client_parameters": { ++ } ++ }, ++ { ++ "test_name": "llama70B_tp4_sharegpt", ++ "qps_list": [4,8,16,32,"inf"], ++ "common_parameters": { ++ "model": "meta-llama/Meta-Llama-3-70B-Instruct", ++ "tp": 4, ++ "dataset_name": "sharegpt", ++ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", ++ "num_prompts": 500, ++ "port": 8000, ++ "reuse_server": false ++ }, ++ "lmdeploy_server_parameters": { ++ "dtype": "bfloat16" ++ }, ++ "lmdeploy_client_parameters": { ++ }, ++ "tgi_server_parameters": { ++ }, ++ "tgi_client_parameters": { ++ "endpoint": "/generate_stream" ++ }, ++ "trt_server_parameters": { ++ "model_type": "llama", ++ "model_dtype": "bfloat16", ++ "max_batch_size": 2048, ++ "max_input_len": 4096, ++ "max_seq_len": 6144, ++ "max_num_tokens": 16384, ++ 
"trt_llm_version": "v0.11.0" ++ }, ++ "trt_client_parameters": { ++ "endpoint": "/v2/models/ensemble/generate_stream" ++ }, ++ "vllm_server_parameters": { ++ "disable_log_stats": "", ++ "disable_log_requests": "", ++ "gpu_memory_utilization": 0.9, ++ "num_scheduler_steps": 10, ++ "max_num_seqs": 512, ++ "dtype": "bfloat16" ++ }, ++ "vllm_client_parameters": { ++ }, ++ "sglang_server_parameters": { ++ "disable_radix_cache": "", ++ "dtype": "bfloat16" ++ }, ++ "sglang_client_parameters": { ++ } ++ }, ++ { ++ "test_name": "llama70B_tp4_sonnet_512_16", ++ "qps_list": [4,8,16,32,"inf"], ++ "common_parameters": { ++ "model": "meta-llama/Meta-Llama-3-70B-Instruct", ++ "tp": 4, ++ "dataset_name": "sonnet", ++ "dataset_path": "./sonnet_4x.txt", ++ "num_prompts": 500, ++ "port": 8000, ++ "sonnet_input_len": 512, ++ "sonnet_output_len": 16, ++ "sonnet_prefix_len": 50, ++ "reuse_server": true ++ }, ++ "lmdeploy_server_parameters": { ++ "dtype": "bfloat16" ++ }, ++ "lmdeploy_client_parameters": { ++ }, ++ "tgi_server_parameters": { ++ }, ++ "tgi_client_parameters": { ++ "endpoint": "/generate_stream" ++ }, ++ "trt_server_parameters": { ++ "model_type": "llama", ++ "model_dtype": "bfloat16", ++ "max_batch_size": 2048, ++ "max_input_len": 4096, ++ "max_seq_len": 6144, ++ "max_num_tokens": 16384, ++ "trt_llm_version": "v0.11.0" ++ }, ++ "trt_client_parameters": { ++ "endpoint": "/v2/models/ensemble/generate_stream" ++ }, ++ "vllm_server_parameters": { ++ "disable_log_stats": "", ++ "disable_log_requests": "", ++ "gpu_memory_utilization": 0.9, ++ "num_scheduler_steps": 10, ++ "max_num_seqs": 512, ++ "dtype": "bfloat16" ++ }, ++ "vllm_client_parameters": { ++ }, ++ "sglang_server_parameters": { ++ "disable_radix_cache": "", ++ "dtype": "bfloat16" ++ }, ++ "sglang_client_parameters": { ++ } ++ }, ++ { ++ "test_name": "llama70B_tp4_sonnet_512_256", ++ "qps_list": [4,8,16,32,"inf"], ++ "common_parameters": { ++ "model": "meta-llama/Meta-Llama-3-70B-Instruct", ++ "tp": 4, ++ "dataset_name": "sonnet", ++ "dataset_path": "./sonnet_4x.txt", ++ "num_prompts": 500, ++ "port": 8000, ++ "sonnet_input_len": 512, ++ "sonnet_output_len": 256, ++ "sonnet_prefix_len": 50, ++ "reuse_server": true ++ }, ++ "lmdeploy_server_parameters": { ++ "dtype": "bfloat16" ++ }, ++ "lmdeploy_client_parameters": { ++ }, ++ "tgi_server_parameters": { ++ }, ++ "tgi_client_parameters": { ++ "endpoint": "/generate_stream" ++ }, ++ "trt_server_parameters": { ++ "model_type": "llama", ++ "model_dtype": "bfloat16", ++ "max_batch_size": 2048, ++ "max_input_len": 4096, ++ "max_seq_len": 6144, ++ "max_num_tokens": 16384, ++ "trt_llm_version": "v0.11.0" ++ }, ++ "trt_client_parameters": { ++ "endpoint": "/v2/models/ensemble/generate_stream" ++ }, ++ "vllm_server_parameters": { ++ "disable_log_stats": "", ++ "disable_log_requests": "", ++ "gpu_memory_utilization": 0.9, ++ "num_scheduler_steps": 10, ++ "max_num_seqs": 512, ++ "dtype": "bfloat16" ++ }, ++ "vllm_client_parameters": { ++ }, ++ "sglang_server_parameters": { ++ "disable_radix_cache": "", ++ "dtype": "bfloat16" ++ }, ++ "sglang_client_parameters": { ++ } ++ } ++] +\ No newline at end of file +diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json +new file mode 100644 +index 0000000..facb0ea +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json +@@ -0,0 +1,80 @@ ++[ ++ { ++ "test_name": "serving_llama8B_tp1_sharegpt", ++ "qps_list": [1, 4, 16, "inf"], ++ "server_parameters": { ++ "model": 
"meta-llama/Meta-Llama-3.1-8B-Instruct", ++ "tensor_parallel_size": 1, ++ "swap_space": 16, ++ "disable_log_stats": "", ++ "disable_log_requests": "", ++ "load_format": "dummy" ++ }, ++ "client_parameters": { ++ "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", ++ "backend": "vllm", ++ "dataset_name": "sharegpt", ++ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", ++ "num_prompts": 200 ++ } ++ }, ++ { ++ "test_name": "serving_llama70B_tp4_sharegpt", ++ "qps_list": [1, 4, 16, "inf"], ++ "server_parameters": { ++ "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", ++ "tensor_parallel_size": 4, ++ "swap_space": 16, ++ "disable_log_stats": "", ++ "disable_log_requests": "", ++ "load_format": "dummy" ++ }, ++ "client_parameters": { ++ "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", ++ "backend": "vllm", ++ "dataset_name": "sharegpt", ++ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", ++ "num_prompts": 200 ++ } ++ }, ++ { ++ "test_name": "serving_mixtral8x7B_tp2_sharegpt", ++ "qps_list": [1, 4, 16, "inf"], ++ "server_parameters": { ++ "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", ++ "tensor_parallel_size": 2, ++ "swap_space": 16, ++ "disable_log_stats": "", ++ "disable_log_requests": "", ++ "load_format": "dummy" ++ }, ++ "client_parameters": { ++ "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", ++ "backend": "vllm", ++ "dataset_name": "sharegpt", ++ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", ++ "num_prompts": 200 ++ } ++ }, ++ { ++ "test_name": "serving_llama70B_tp4_sharegpt_specdecode", ++ "qps_list": [2], ++ "server_parameters": { ++ "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", ++ "disable_log_requests": "", ++ "tensor_parallel_size": 4, ++ "swap_space": 16, ++ "speculative_model": "turboderp/Qwama-0.5B-Instruct", ++ "num_speculative_tokens": 4, ++ "speculative_draft_tensor_parallel_size": 1, ++ "use_v2_block_manager": "" ++ }, ++ "client_parameters": { ++ "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", ++ "backend": "vllm", ++ "dataset_name": "sharegpt", ++ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", ++ "num_prompts": 200 ++ } ++ } ++] +diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/.buildkite/nightly-benchmarks/tests/throughput-tests.json +new file mode 100644 +index 0000000..91ef6d1 +--- /dev/null ++++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json +@@ -0,0 +1,35 @@ ++[ ++ { ++ "test_name": "throughput_llama8B_tp1", ++ "parameters": { ++ "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", ++ "tensor_parallel_size": 1, ++ "load_format": "dummy", ++ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", ++ "num_prompts": 200, ++ "backend": "vllm" ++ } ++ }, ++ { ++ "test_name": "throughput_llama70B_tp4", ++ "parameters": { ++ "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", ++ "tensor_parallel_size": 4, ++ "load_format": "dummy", ++ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", ++ "num_prompts": 200, ++ "backend": "vllm" ++ } ++ }, ++ { ++ "test_name": "throughput_mixtral8x7B_tp2", ++ "parameters": { ++ "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", ++ "tensor_parallel_size": 2, ++ "load_format": "dummy", ++ "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", ++ "num_prompts": 200, ++ "backend": "vllm" ++ } ++ } ++] +\ No newline at end of file +diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml +new file mode 100644 +index 0000000..51618a2 +--- /dev/null ++++ b/.buildkite/release-pipeline.yaml +@@ -0,0 +1,72 @@ ++steps: 
++ - label: "Build wheel - CUDA 12.1" ++ agents: ++ queue: cpu_queue_postmerge ++ commands: ++ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." ++ - "mkdir artifacts" ++ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" ++ - "bash .buildkite/upload-wheels.sh" ++ env: ++ DOCKER_BUILDKIT: "1" ++ ++ # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working. ++ # However, this block can be uncommented to save some compute hours. ++ # - block: "Build CUDA 11.8 wheel" ++ # key: block-build-cu118-wheel ++ ++ - label: "Build wheel - CUDA 11.8" ++ # depends_on: block-build-cu118-wheel ++ agents: ++ queue: cpu_queue_postmerge ++ commands: ++ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." ++ - "mkdir artifacts" ++ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" ++ - "bash .buildkite/upload-wheels.sh" ++ env: ++ DOCKER_BUILDKIT: "1" ++ ++ - block: "Build release image" ++ depends_on: ~ ++ key: block-release-image-build ++ ++ - label: "Build release image" ++ depends_on: block-release-image-build ++ agents: ++ queue: cpu_queue_postmerge ++ commands: ++ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" ++ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ." ++ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" ++ ++ - label: "Build and publish TPU release image" ++ depends_on: ~ ++ if: build.env("NIGHTLY") == "1" ++ agents: ++ queue: tpu_queue_postmerge ++ commands: ++ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ." ++ - "docker push vllm/vllm-tpu:nightly" ++ - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" ++ plugins: ++ - docker-login#v3.0.0: ++ username: vllm ++ password-env: DOCKERHUB_TOKEN ++ env: ++ DOCKER_BUILDKIT: "1" ++ ++ - block: "Build CPU release image" ++ key: block-cpu-release-image-build ++ depends_on: ~ ++ ++ - label: "Build and publish CPU release image" ++ depends_on: block-cpu-release-image-build ++ agents: ++ queue: cpu_queue_postmerge ++ commands: ++ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" ++ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ." 
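[Editor's note] The wheel-build steps above never push the build container itself; they extract `dist/` through a bind mount and hand the wheels to `upload-wheels.sh`. A minimal sketch of that extraction pattern ("my-build-image" and the paths are placeholders, not the real CI image tag):

```bash
#!/bin/bash
# Build artifacts are copied out of the container via a host bind mount.
mkdir -p artifacts
docker run --rm -v "$(pwd)/artifacts:/artifacts_host" my-build-image \
    bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'
```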
++ - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION" ++ env: ++ DOCKER_BUILDKIT: "1" +diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh +index c04e05a..3515ccd 100644 +--- a/.buildkite/run-amd-test.sh ++++ b/.buildkite/run-amd-test.sh +@@ -1,10 +1,49 @@ +-# This script build the ROCm docker image and runs test inside it. +-set -ex ++#!/bin/bash ++ ++# This script runs test inside the corresponding ROCm docker container. ++set -o pipefail + + # Print ROCm version ++echo "--- Confirming Clean Initial State" ++while true; do ++ sleep 3 ++ if grep -q clean /opt/amdgpu/etc/gpu_state; then ++ echo "GPUs state is \"clean\"" ++ break ++ fi ++done ++ + echo "--- ROCm info" + rocminfo + ++# cleanup older docker images ++cleanup_docker() { ++ # Get Docker's root directory ++ docker_root=$(docker info -f '{{.DockerRootDir}}') ++ if [ -z "$docker_root" ]; then ++ echo "Failed to determine Docker root directory." ++ exit 1 ++ fi ++ echo "Docker root directory: $docker_root" ++ # Check disk usage of the filesystem where Docker's root directory is located ++ disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') ++ # Define the threshold ++ threshold=70 ++ if [ "$disk_usage" -gt "$threshold" ]; then ++ echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." ++ # Remove dangling images (those that are not tagged and not used by any container) ++ docker image prune -f ++ # Remove unused volumes / force the system prune for old images as well. ++ docker volume prune -f && docker system prune --force --filter "until=72h" --all ++ echo "Docker images and volumes cleanup completed." ++ else ++ echo "Disk usage is below $threshold%. No cleanup needed." ++ fi ++} ++ ++# Call the cleanup docker function ++cleanup_docker ++ + echo "--- Resetting GPUs" + + echo "reset" > /opt/amdgpu/etc/gpu_state +@@ -17,28 +56,101 @@ while true; do + fi + done + +-echo "--- Building container" +-sha=$(git rev-parse --short HEAD) +-container_name=rocm_${sha} +-docker build \ +- -t ${container_name} \ +- -f Dockerfile.rocm \ +- --progress plain \ +- . 
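[Editor's note] The `cleanup_docker` function added above prunes images and volumes only when the filesystem holding Docker's root directory is more than 70% full. A condensed sketch of that usage-threshold check, kept close to the source:

```bash
#!/bin/bash
# Prune Docker images/volumes only when disk usage crosses a threshold.
docker_root=$(docker info -f '{{.DockerRootDir}}')
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage ${disk_usage}% is above ${threshold}%; pruning unused images and volumes."
    docker image prune -f
    docker volume prune -f
else
    echo "Disk usage ${disk_usage}% is below ${threshold}%; nothing to do."
fi
```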
++echo "--- Pulling container" ++image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" ++container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" ++docker pull "${image_name}" + + remove_docker_container() { +- docker rm -f ${container_name} || docker image rm -f ${container_name} || true ++ docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true + } + trap remove_docker_container EXIT + + echo "--- Running container" + +-docker run \ ++HF_CACHE="$(realpath ~)/huggingface" ++mkdir -p "${HF_CACHE}" ++HF_MOUNT="/root/.cache/huggingface" ++ ++commands=$@ ++echo "Commands:$commands" ++#ignore certain kernels tests ++if [[ $commands == *" kernels "* ]]; then ++ commands="${commands} \ ++ --ignore=kernels/test_attention.py \ ++ --ignore=kernels/test_attention_selector.py \ ++ --ignore=kernels/test_blocksparse_attention.py \ ++ --ignore=kernels/test_causal_conv1d.py \ ++ --ignore=kernels/test_cutlass.py \ ++ --ignore=kernels/test_encoder_decoder_attn.py \ ++ --ignore=kernels/test_flash_attn.py \ ++ --ignore=kernels/test_flashinfer.py \ ++ --ignore=kernels/test_int8_quant.py \ ++ --ignore=kernels/test_machete_gemm.py \ ++ --ignore=kernels/test_mamba_ssm.py \ ++ --ignore=kernels/test_marlin_gemm.py \ ++ --ignore=kernels/test_moe.py \ ++ --ignore=kernels/test_prefix_prefill.py \ ++ --ignore=kernels/test_rand.py \ ++ --ignore=kernels/test_sampler.py" ++fi ++ ++#ignore certain Entrypoints tests ++if [[ $commands == *" entrypoints/openai "* ]]; then ++ commands=${commands//" entrypoints/openai "/" entrypoints/openai \ ++ --ignore=entrypoints/openai/test_accuracy.py \ ++ --ignore=entrypoints/openai/test_audio.py \ ++ --ignore=entrypoints/openai/test_encoder_decoder.py \ ++ --ignore=entrypoints/openai/test_embedding.py \ ++ --ignore=entrypoints/openai/test_oot_registration.py "} ++fi ++ ++PARALLEL_JOB_COUNT=8 ++# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. ++if [[ $commands == *"--shard-id="* ]]; then ++ # assign job count as the number of shards used ++ commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} ++ for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do ++ # assign shard-id for each shard ++ commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} ++ echo "Shard ${GPU} commands:$commands_gpu" ++ docker run \ + --device /dev/kfd --device /dev/dri \ + --network host \ ++ --shm-size=16gb \ + --rm \ ++ -e HIP_VISIBLE_DEVICES="${GPU}" \ + -e HF_TOKEN \ +- --name ${container_name} \ +- ${container_name} \ +- /bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//") +- ++ -v "${HF_CACHE}:${HF_MOUNT}" \ ++ -e "HF_HOME=${HF_MOUNT}" \ ++ --name "${container_name}_${GPU}" \ ++ "${image_name}" \ ++ /bin/bash -c "${commands_gpu}" \ ++ |& while read -r line; do echo ">>Shard $GPU: $line"; done & ++ PIDS+=($!) ++ done ++ #wait for all processes to finish and collect exit codes ++ for pid in "${PIDS[@]}"; do ++ wait "${pid}" ++ STATUS+=($?) 
++ done ++ for st in "${STATUS[@]}"; do ++ if [[ ${st} -ne 0 ]]; then ++ echo "One of the processes failed with $st" ++ exit "${st}" ++ fi ++ done ++else ++ docker run \ ++ --device /dev/kfd --device /dev/dri \ ++ --network host \ ++ --shm-size=16gb \ ++ --rm \ ++ -e HIP_VISIBLE_DEVICES=0 \ ++ -e HF_TOKEN \ ++ -v "${HF_CACHE}:${HF_MOUNT}" \ ++ -e "HF_HOME=${HF_MOUNT}" \ ++ --name "${container_name}" \ ++ "${image_name}" \ ++ /bin/bash -c "${commands}" ++fi +diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh +index 7fbad1c..1641c1f 100644 +--- a/.buildkite/run-benchmarks.sh ++++ b/.buildkite/run-benchmarks.sh +@@ -1,3 +1,5 @@ ++#!/bin/bash ++ + # This script is run by buildkite to run the benchmarks and upload the results to buildkite + + set -ex +@@ -9,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.." + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + + # run python-based benchmarks and upload the result to buildkite +-python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt ++python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt + bench_latency_exit_code=$? + +-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt ++python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt + bench_throughput_exit_code=$? + + # run server-based benchmarks and upload the result to buildkite +@@ -50,16 +52,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md + sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line + echo "" >> benchmark_results.md + echo '```' >> benchmark_results.md +-tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines ++tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines + echo '```' >> benchmark_results.md + + # if the agent binary is not found, skip uploading the results, exit 0 +-if [ ! -f /workspace/buildkite-agent ]; then ++if [ ! -f /usr/bin/buildkite-agent ]; then + exit 0 + fi + + # upload the results to buildkite +-/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md ++buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md + + # exit with the exit code of the benchmarks + if [ $bench_latency_exit_code -ne 0 ]; then +@@ -74,4 +76,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then + exit $bench_serving_exit_code + fi + +-/workspace/buildkite-agent artifact upload openai-*.json ++rm ShareGPT_V3_unfiltered_cleaned_split.json ++buildkite-agent artifact upload "*.json" +diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh +new file mode 100644 +index 0000000..bc06838 +--- /dev/null ++++ b/.buildkite/run-cpu-test-ppc64le.sh +@@ -0,0 +1,14 @@ ++#!/bin/bash ++ ++# This script build the CPU docker image and run the offline inference inside the container. ++# It serves a sanity check for compilation and basic model usage. ++set -ex ++ ++# Setup cleanup ++remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; } ++trap remove_docker_container EXIT ++remove_docker_container ++ ++# Try building the docker image ++docker build -t cpu-test -f Dockerfile.ppc64le . 
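[Editor's note] Returning to `run-amd-test.sh` above: when the test command carries `--shard-id`, the script fans out one container per GPU, collects every background PID, and fails the step if any shard fails. A generic sketch of that wait-and-collect pattern, with a placeholder worker in place of the `docker run` call:

```bash
#!/bin/bash
# Launch workers in parallel, then propagate the first non-zero exit code.
PIDS=()
for shard in 0 1 2 3; do
    ( echo "shard $shard starting"; sleep 1 ) &   # placeholder for the per-GPU docker run
    PIDS+=($!)
done

STATUS=()
for pid in "${PIDS[@]}"; do
    wait "$pid"
    STATUS+=($?)
done

for st in "${STATUS[@]}"; do
    if [[ $st -ne 0 ]]; then
        echo "one shard failed with exit code $st"
        exit "$st"
    fi
done
echo "all shards passed"
```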
++ +diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh +index f187d1f..9925db7 100644 +--- a/.buildkite/run-cpu-test.sh ++++ b/.buildkite/run-cpu-test.sh +@@ -1,14 +1,88 @@ ++#!/bin/bash ++ + # This script build the CPU docker image and run the offline inference inside the container. + # It serves a sanity check for compilation and basic model usage. + set -ex + ++# allow to bind to different cores ++CORE_RANGE=${CORE_RANGE:-48-95} ++NUMA_NODE=${NUMA_NODE:-1} ++ + # Try building the docker image +-docker build -t cpu-test -f Dockerfile.cpu . ++numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu . ++numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu . + + # Setup cleanup +-remove_docker_container() { docker rm -f cpu-test || true; } ++remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; } + trap remove_docker_container EXIT + remove_docker_container + +-# Run the image and launch offline inference +-docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py ++# Run the image, setting --shm-size=4g for tensor parallel. ++docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ ++ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" ++docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ ++ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 ++ ++function cpu_tests() { ++ set -e ++ export NUMA_NODE=$2 ++ ++ # offline inference ++ docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " ++ set -e ++ python3 examples/offline_inference/basic.py" ++ ++ # Run basic model test ++ docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " ++ set -e ++ pip install -r vllm/requirements-test.txt ++ pytest -v -s tests/models/decoder_only/language -m cpu_model ++ pytest -v -s tests/models/embedding/language -m cpu_model ++ pytest -v -s tests/models/encoder_decoder/language -m cpu_model ++ pytest -v -s tests/models/decoder_only/audio_language -m cpu_model ++ pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" ++ ++ # Run compressed-tensor test ++ docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " ++ set -e ++ pytest -s -v \ ++ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ ++ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" ++ ++ # Run AWQ test ++ docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " ++ set -e ++ pytest -s -v \ ++ tests/quantization/test_ipex_quant.py" ++ ++ # Run chunked-prefill and prefix-cache test ++ docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " ++ set -e ++ pytest -s -v -k cpu_model \ ++ tests/basic_correctness/test_chunked_prefill.py" ++ ++ # online serving ++ docker exec 
cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " ++ set -e ++ export VLLM_CPU_KVCACHE_SPACE=10 ++ export VLLM_CPU_OMP_THREADS_BIND=$1 ++ python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & ++ timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 ++ python3 benchmarks/benchmark_serving.py \ ++ --backend vllm \ ++ --dataset-name random \ ++ --model facebook/opt-125m \ ++ --num-prompts 20 \ ++ --endpoint /v1/completions \ ++ --tokenizer facebook/opt-125m" ++ ++ # Run multi-lora tests ++ docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " ++ set -e ++ pytest -s -v \ ++ tests/lora/test_qwen2vl.py" ++} ++ ++# All of CPU tests are expected to be finished less than 25 mins. ++export -f cpu_tests ++timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh +new file mode 100644 +index 0000000..3e4e409 +--- /dev/null ++++ b/.buildkite/run-gh200-test.sh +@@ -0,0 +1,28 @@ ++#!/bin/bash ++ ++# This script build the GH200 docker image and run the offline inference inside the container. ++# It serves a sanity check for compilation and basic model usage. ++set -ex ++ ++# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile ++python3 use_existing_torch.py ++ ++# Try building the docker image ++DOCKER_BUILDKIT=1 docker build . \ ++ --target vllm-openai \ ++ --platform "linux/arm64" \ ++ -t gh200-test \ ++ --build-arg max_jobs=66 \ ++ --build-arg nvcc_threads=2 \ ++ --build-arg torch_cuda_arch_list="9.0+PTX" \ ++ --build-arg vllm_fa_cmake_gpu_arches="90-real" ++ ++# Setup cleanup ++remove_docker_container() { docker rm -f gh200-test || true; } ++trap remove_docker_container EXIT ++remove_docker_container ++ ++# Run the image and test offline inference ++docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' ++ python3 examples/offline_inference/basic.py ++' +diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh +new file mode 100644 +index 0000000..8f3b082 +--- /dev/null ++++ b/.buildkite/run-hpu-test.sh +@@ -0,0 +1,16 @@ ++#!/bin/bash ++ ++# This script build the CPU docker image and run the offline inference inside the container. ++# It serves a sanity check for compilation and basic model usage. ++set -ex ++ ++# Try building the docker image ++docker build -t hpu-test-env -f Dockerfile.hpu . ++ ++# Setup cleanup ++remove_docker_container() { docker rm -f hpu-test || true; } ++trap remove_docker_container EXIT ++remove_docker_container ++ ++# Run the image and launch offline inference ++docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py +\ No newline at end of file +diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh +new file mode 100644 +index 0000000..530bf90 +--- /dev/null ++++ b/.buildkite/run-multi-node-test.sh +@@ -0,0 +1,108 @@ ++#!/bin/bash ++ ++set -euox pipefail ++ ++if [[ $# -lt 4 ]]; then ++ echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" ++ exit 1 ++fi ++ ++WORKING_DIR=$1 ++NUM_NODES=$2 ++NUM_GPUS=$3 ++DOCKER_IMAGE=$4 ++ ++shift 4 ++COMMANDS=("$@") ++if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then ++ echo "The number of commands must be equal to the number of nodes." 
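[Editor's note] A detail worth calling out in `run-cpu-test.sh` above: `cpu_tests` is a shell function, so it must be exported with `export -f` before `timeout` can invoke it in the fresh child bash it spawns. A tiny self-contained sketch of that pattern:

```bash
#!/bin/bash
# Exported functions are inherited by child bash processes, including the one
# started by `timeout ... bash -c`.
my_tests() {
    echo "running tests with arg: $1"
}
export -f my_tests
timeout 30s bash -c "my_tests demo-arg"
```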
++ echo "Number of nodes: $NUM_NODES" ++ echo "Number of commands: ${#COMMANDS[@]}" ++ exit 1 ++fi ++ ++echo "List of commands" ++for command in "${COMMANDS[@]}"; do ++ echo "$command" ++done ++ ++start_network() { ++ docker network create --subnet=192.168.10.0/24 docker-net ++} ++ ++start_nodes() { ++ for node in $(seq 0 $(($NUM_NODES-1))); do ++ GPU_DEVICES='"device=' ++ for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do ++ DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) ++ GPU_DEVICES+=$(($DEVICE_NUM)) ++ if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then ++ GPU_DEVICES+=',' ++ fi ++ done ++ GPU_DEVICES+='"' ++ ++ # start the container in detached mode ++ # things to note: ++ # 1. --shm-size=10.24gb is required. don't use --ipc=host ++ # 2. pass HF_TOKEN to the container ++ # 3. map the huggingface cache directory to the container ++ # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: ++ # starting from 192.168.10.11) ++ docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \ ++ -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \ ++ --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \ ++ /bin/bash -c "tail -f /dev/null" ++ ++ # organize containers into a ray cluster ++ if [ "$node" -eq 0 ]; then ++ # start the ray head node ++ docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block" ++ # wait for the head node to be ready ++ sleep 10 ++ else ++ # start the ray worker nodes, and connect them to the head node ++ docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block" ++ fi ++ done ++ ++ # wait for the cluster to be ready ++ sleep 10 ++ ++ # print the cluster status ++ docker exec node0 /bin/bash -c "ray status" ++} ++ ++run_nodes() { ++ # important: iterate in reverse order to start the head node last ++ # we start the worker nodes first, in detached mode, and then start the head node ++ # in the foreground, so that the output of the head node is visible in the buildkite logs ++ for node in $(seq $(($NUM_NODES - 1)) -1 0); do ++ GPU_DEVICES='"device=' ++ for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do ++ DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) ++ GPU_DEVICES+=$(($DEVICE_NUM)) ++ if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then ++ GPU_DEVICES+=',' ++ fi ++ done ++ GPU_DEVICES+='"' ++ echo "Running node$node with GPU devices: $GPU_DEVICES" ++ if [ "$node" -ne 0 ]; then ++ docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" ++ else ++ docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" ++ fi ++ done ++} ++cleanup() { ++ for node in $(seq 0 $(($NUM_NODES-1))); do ++ docker stop "node$node" ++ done ++ docker network rm docker-net ++} ++trap cleanup EXIT ++start_network ++start_nodes ++run_nodes ++ +diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh +index 252c0f7..189714e 100644 +--- a/.buildkite/run-neuron-test.sh ++++ b/.buildkite/run-neuron-test.sh +@@ -1,6 +1,20 @@ ++#!/bin/bash ++ + # This script build the Neuron docker image and run the API server inside the container. + # It serves a sanity check for compilation and basic model usage. 
+ set -e ++set -v ++ ++image_name="neuron/vllm-ci" ++container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" ++ ++HF_CACHE="$(realpath ~)/huggingface" ++mkdir -p "${HF_CACHE}" ++HF_MOUNT="/root/.cache/huggingface" ++ ++NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache" ++mkdir -p "${NEURON_COMPILE_CACHE_URL}" ++NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache" + + # Try building the docker image + aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com +@@ -11,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then + last_build=$(cat /tmp/neuron-docker-build-timestamp) + current_time=$(date +%s) + if [ $((current_time - last_build)) -gt 86400 ]; then ++ docker image prune -f + docker system prune -f +- echo $current_time > /tmp/neuron-docker-build-timestamp ++ rm -rf "${HF_MOUNT:?}/*" ++ rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*" ++ echo "$current_time" > /tmp/neuron-docker-build-timestamp + fi + else +- echo $(date +%s) > /tmp/neuron-docker-build-timestamp ++ date "+%s" > /tmp/neuron-docker-build-timestamp + fi + +-docker build -t neuron -f Dockerfile.neuron . ++docker build -t "${image_name}" -f Dockerfile.neuron . + + # Setup cleanup +-remove_docker_container() { docker rm -f neuron || true; } ++remove_docker_container() { ++ docker image rm -f "${image_name}" || true; ++} + trap remove_docker_container EXIT +-remove_docker_container + + # Run the image +-docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \ +- --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 & +- +-# Wait for the server to start +-wait_for_server_to_start() { +- timeout=300 +- counter=0 +- +- while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do +- sleep 1 +- counter=$((counter + 1)) +- if [ $counter -ge $timeout ]; then +- echo "Timeout after $timeout seconds" +- break +- fi +- done +-} +-wait_for_server_to_start +- +-# Test a simple prompt +-curl -X POST -H "Content-Type: application/json" \ +- localhost:8000/generate \ +- -d '{"prompt": "San Francisco is a"}' ++docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ ++ -v "${HF_CACHE}:${HF_MOUNT}" \ ++ -e "HF_HOME=${HF_MOUNT}" \ ++ -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ ++ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ ++ --name "${container_name}" \ ++ ${image_name} \ ++ /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py" +diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh +new file mode 100644 +index 0000000..6159b21 +--- /dev/null ++++ b/.buildkite/run-openvino-test.sh +@@ -0,0 +1,16 @@ ++#!/bin/bash ++ ++# This script build the OpenVINO docker image and run the offline inference inside the container. ++# It serves a sanity check for compilation and basic model usage. ++set -ex ++ ++# Try building the docker image ++docker build -t openvino-test -f Dockerfile.openvino . 
++ ++# Setup cleanup ++remove_docker_container() { docker rm -f openvino-test || true; } ++trap remove_docker_container EXIT ++remove_docker_container ++ ++# Run the image and launch offline inference ++docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py +diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh +new file mode 100644 +index 0000000..650af0f +--- /dev/null ++++ b/.buildkite/run-tpu-test.sh +@@ -0,0 +1,26 @@ ++#!/bin/bash ++ ++set -e ++ ++# Build the docker image. ++docker build -f Dockerfile.tpu -t vllm-tpu . ++ ++# Set up cleanup. ++remove_docker_container() { docker rm -f tpu-test || true; } ++trap remove_docker_container EXIT ++# Remove the container that might not be cleaned up in the previous run. ++remove_docker_container ++ ++# For HF_TOKEN. ++source /etc/environment ++# Run a simple end-to-end example. ++docker run --privileged --net host --shm-size=16G -it \ ++ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ ++ vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ ++ && python3 -m pip install pytest \ ++ && python3 -m pip install lm_eval[api]==0.4.4 \ ++ && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \ ++ && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ ++ && python3 /workspace/vllm/tests/tpu/test_compilation.py \ ++ && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ ++ && python3 /workspace/vllm/examples/offline_inference/tpu.py" +diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh +new file mode 100644 +index 0000000..4d344e5 +--- /dev/null ++++ b/.buildkite/run-xpu-test.sh +@@ -0,0 +1,19 @@ ++#!/bin/bash ++ ++# This script build the CPU docker image and run the offline inference inside the container. ++# It serves a sanity check for compilation and basic model usage. ++set -ex ++ ++# Try building the docker image ++docker build -t xpu-test -f Dockerfile.xpu . ++ ++# Setup cleanup ++remove_docker_container() { docker rm -f xpu-test || true; } ++trap remove_docker_container EXIT ++remove_docker_container ++ ++# Run the image and test offline inference/tensor parallel ++docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' ++ python3 examples/offline_inference/basic.py ++ python3 examples/offline_inference/cli.py -tp 2 ++' +diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml +index e49a565..74b287c 100644 +--- a/.buildkite/test-pipeline.yaml ++++ b/.buildkite/test-pipeline.yaml +@@ -1,132 +1,594 @@ + # In this file, you can add more tests to run either by adding a new step or + # adding a new command to an existing step. See different options here for examples. +-# This script will be feed into Jinja template in `test-template.j2` to generate +-# the final pipeline yaml file. ++ ++# This script will be feed into Jinja template in `test-template-aws.j2` at ++# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 ++# to generate the final pipeline yaml file. ++ ++# Documentation ++# label(str): the name of the test. emoji allowed. ++# fast_check(bool): whether to run this on each commit on fastcheck pipeline. ++# fast_check_only(bool): run this test on fastcheck pipeline only ++# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run. 
++# command(str): the single command to run for tests. incompatible with commands. ++# commands(list): the list of commands to run for test. incompatbile with command. ++# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd] ++# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100 ++# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4. ++# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host, ++# in this case, commands must be specified. the first command runs on first host, the second ++# command runs on the second host. ++# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests ++# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run. ++ ++# When adding a test ++# - If the test belong to an existing group, add it there ++# - If the test is short, add to any existing step ++# - If the test takes more than 10min, then it is okay to create a new step. ++# Note that all steps execute in parallel. + + steps: +-- label: Regression Test +- command: pytest -v -s test_regression.py +- working_dir: "/vllm-workspace/tests" # optional ++##### fast check tests ##### ++ ++- label: Documentation Build # 2min ++ working_dir: "/vllm-workspace/test_docs/docs" ++ fast_check: true ++ no_gpu: True ++ commands: ++ - pip install -r requirements-docs.txt ++ - SPHINXOPTS=\"-W\" make html ++ # Check API reference (if it fails, you may have missing mock imports) ++ - grep \"sig sig-object py\" build/html/api/inference_params.html ++ ++- label: Async Engine, Inputs, Utils, Worker Test # 24min ++ fast_check: true ++ source_file_dependencies: ++ - vllm/ ++ - tests/mq_llm_engine ++ - tests/async_engine ++ - tests/test_inputs ++ - tests/multimodal ++ - tests/test_utils ++ - tests/worker ++ - tests/standalone_tests/lazy_torch_compile.py ++ commands: ++ - pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git # Used by multimoda processing test ++ - python3 standalone_tests/lazy_torch_compile.py ++ - pytest -v -s mq_llm_engine # MQLLMEngine ++ - pytest -v -s async_engine # AsyncLLMEngine ++ - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py ++ - pytest -v -s test_inputs.py ++ - pytest -v -s multimodal ++ - pytest -v -s test_utils.py # Utils ++ - pytest -v -s worker # Worker ++ ++- label: Python-only Installation Test ++ source_file_dependencies: ++ - tests/standalone_tests/python_only_compile.sh ++ - setup.py ++ commands: ++ - bash standalone_tests/python_only_compile.sh + +-- label: AsyncEngine Test +- command: pytest -v -s async_engine ++- label: Basic Correctness Test # 30min ++ #mirror_hardwares: [amd] ++ fast_check: true ++ source_file_dependencies: ++ - vllm/ ++ - tests/basic_correctness/test_basic_correctness ++ - tests/basic_correctness/test_cpu_offload ++ - tests/basic_correctness/test_preemption ++ commands: ++ - pytest -v -s basic_correctness/test_basic_correctness.py ++ - pytest -v -s basic_correctness/test_cpu_offload.py ++ - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py + +-- label: Basic Correctness Test ++- label: Chunked Prefill Test ++ source_file_dependencies: ++ - vllm/ ++ - tests/basic_correctness/test_chunked_prefill + commands: +- - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py +- - 
VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py +- - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py + +-- label: Core Test ++- label: Core Test # 10min + mirror_hardwares: [amd] +- command: pytest -v -s core ++ fast_check: true ++ source_file_dependencies: ++ - vllm/core ++ - vllm/distributed ++ - tests/core ++ commands: ++ - pytest -v -s core + +-- label: Distributed Comm Ops Test +- command: pytest -v -s test_comm_ops.py +- working_dir: "/vllm-workspace/tests/distributed" +- num_gpus: 2 ++- label: Entrypoints Test # 40min ++ working_dir: "/vllm-workspace/tests" ++ fast_check: true ++ mirror_hardwares: [amd] ++ source_file_dependencies: ++ - vllm/ ++ commands: ++ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py ++ - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process ++ - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process ++ - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process ++ - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process ++ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py ++ - pytest -v -s entrypoints/test_chat_utils.py ++ - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests ++ ++- label: Distributed Tests (4 GPUs) # 10min ++ working_dir: "/vllm-workspace/tests" ++ num_gpus: 4 ++ fast_check: true ++ source_file_dependencies: ++ - vllm/distributed/ ++ - vllm/core/ ++ - tests/distributed ++ - tests/spec_decode/e2e/test_integration_dist_tp4 ++ - tests/compile ++ commands: ++ - pytest -v -s distributed/test_utils.py ++ - pytest -v -s compile/test_basic_correctness.py ++ - pytest -v -s distributed/test_pynccl.py ++ - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py ++ ++- label: Metrics, Tracing Test # 10min ++ num_gpus: 2 ++ fast_check: true ++ source_file_dependencies: ++ - vllm/ ++ - tests/metrics ++ - tests/tracing ++ commands: ++ - pytest -v -s metrics ++ - "pip install \ ++ 'opentelemetry-sdk>=1.26.0,<1.27.0' \ ++ 'opentelemetry-api>=1.26.0,<1.27.0' \ ++ 'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \ ++ 'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'" ++ - pytest -v -s tracing + +-- label: Distributed Tests +- working_dir: "/vllm-workspace/tests/distributed" ++##### fast check tests ##### ++##### 1 GPU test ##### + +- num_gpus: 2 # only support 1 or 2 for now. 
++- label: Regression Test # 5min + mirror_hardwares: [amd] ++ source_file_dependencies: ++ - vllm/ ++ - tests/test_regression ++ commands: ++ - pip install modelscope ++ - pytest -v -s test_regression.py ++ working_dir: "/vllm-workspace/tests" # optional + ++- label: Engine Test # 10min ++ mirror_hardwares: [amd] ++ source_file_dependencies: ++ - vllm/ ++ - tests/engine ++ - tests/tokenization + commands: +- - pytest -v -s test_pynccl_library.py +- - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py +- - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py +- - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py +- - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py ++ - pytest -v -s engine test_sequence.py test_config.py test_logger.py ++ # OOM in the CI unless we run this separately ++ - pytest -v -s tokenization + +-- label: Distributed Tests (Multiple Groups) +- working_dir: "/vllm-workspace/tests/distributed" +- num_gpus: 4 ++- label: V1 Test ++ #mirror_hardwares: [amd] ++ source_file_dependencies: ++ - vllm/ ++ - tests/v1 ++ commands: ++ - VLLM_USE_V1=1 pytest -v -s v1 ++ ++- label: Examples Test # 25min ++ working_dir: "/vllm-workspace/examples" ++ #mirror_hardwares: [amd] ++ source_file_dependencies: ++ - vllm/entrypoints ++ - examples/ + commands: +- - pytest -v -s test_pynccl.py ++ - pip install tensorizer # for tensorizer test ++ - python3 offline_inference/basic.py ++ - python3 offline_inference/cpu_offload.py ++ - python3 offline_inference/chat.py ++ - python3 offline_inference/prefix_caching.py ++ - python3 offline_inference/llm_engine_example.py ++ - python3 offline_inference/vision_language.py ++ - python3 offline_inference/vision_language_multi_image.py ++ - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors ++ - python3 offline_inference/encoder_decoder.py ++ - python3 offline_inference/classification.py ++ - python3 offline_inference/embedding.py ++ - python3 offline_inference/scoring.py ++ - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 + +-- label: Engine Test ++- label: Prefix Caching Test # 9min + mirror_hardwares: [amd] +- command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py ++ source_file_dependencies: ++ - vllm/ ++ - tests/prefix_caching ++ commands: ++ - pytest -v -s prefix_caching + +-- label: Entrypoints Test ++- label: Samplers Test # 36min ++ source_file_dependencies: ++ - vllm/model_executor/layers ++ - vllm/sampling_metadata.py ++ - tests/samplers ++ - tests/conftest.py + commands: +- # these tests have to be separated, because each one will allocate all posible GPU memory +- - pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py +- - pytest -v -s entrypoints/test_server_oot_registration.py ++ - pytest -v -s samplers ++ - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + +-- label: Examples Test +- working_dir: "/vllm-workspace/examples" ++- label: LogitsProcessor Test # 5min + mirror_hardwares: [amd] ++ source_file_dependencies: ++ - vllm/model_executor/layers ++ - vllm/model_executor/guided_decoding ++ - tests/test_logits_processor ++ - tests/model_executor/test_guided_processors ++ commands: ++ - 
pytest -v -s test_logits_processor.py ++ - pytest -v -s model_executor/test_guided_processors.py ++ ++- label: Speculative decoding tests # 40min ++ source_file_dependencies: ++ - vllm/spec_decode ++ - tests/spec_decode ++ - vllm/model_executor/models/eagle.py + commands: +- # install aws cli for llava_example.py +- - pip install awscli +- - python3 offline_inference.py +- - python3 offline_inference_with_prefix.py +- - python3 llm_engine_example.py +- - python3 llava_example.py ++ - pytest -v -s spec_decode/e2e/test_multistep_correctness.py ++ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py ++ - pytest -v -s spec_decode/e2e/test_eagle_correctness.py + +-- label: Kernels Test %N +- command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT ++- label: LoRA Test %N # 15min each ++ mirror_hardwares: [amd] ++ source_file_dependencies: ++ - vllm/lora ++ - tests/lora ++ command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py + parallelism: 4 + +-- label: Models Test ++- label: "PyTorch Fullgraph Smoke Test" # 9min ++ fast_check: true ++ source_file_dependencies: ++ - vllm/ ++ - tests/compile ++ commands: ++ - pytest -v -s compile/test_basic_correctness.py ++ # these tests need to be separated, cannot combine ++ - pytest -v -s compile/piecewise/test_simple.py ++ - pytest -v -s compile/piecewise/test_toy_llama.py ++ ++- label: "PyTorch Fullgraph Test" # 18min ++ source_file_dependencies: ++ - vllm/ ++ - tests/compile ++ commands: ++ - pytest -v -s compile/test_full_graph.py ++ ++- label: Kernels Test %N # 1h each + mirror_hardwares: [amd] ++ source_file_dependencies: ++ - csrc/ ++ - vllm/attention ++ - tests/kernels + commands: +- - bash ../.buildkite/download-images.sh +- - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py ++ - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT ++ parallelism: 4 + +-- label: Llava Test ++- label: Tensorizer Test # 11min + mirror_hardwares: [amd] ++ soft_fail: true ++ source_file_dependencies: ++ - vllm/model_executor/model_loader ++ - tests/tensorizer_loader + commands: +- - bash ../.buildkite/download-images.sh +- - pytest -v -s models/test_llava.py ++ - apt-get update && apt-get install -y curl libsodium23 ++ - export VLLM_WORKER_MULTIPROC_METHOD=spawn ++ - pytest -v -s tensorizer_loader + +-- label: Prefix Caching Test ++- label: Benchmarks # 9min ++ working_dir: "/vllm-workspace/.buildkite" + mirror_hardwares: [amd] ++ source_file_dependencies: ++ - benchmarks/ + commands: +- - pytest -v -s prefix_caching ++ - bash run-benchmarks.sh + +-- label: Samplers Test +- command: pytest -v -s samplers ++- label: Quantization Test # 33min ++ source_file_dependencies: ++ - csrc/ ++ - vllm/model_executor/layers/quantization ++ - tests/quantization ++ command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization + +-- label: LogitsProcessor Test +- mirror_hardwares: [amd] +- command: pytest -v -s test_logits_processor.py ++- label: LM Eval Small Models # 53min ++ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" ++ source_file_dependencies: ++ - csrc/ ++ - vllm/model_executor/layers/quantization ++ commands: ++ - export VLLM_WORKER_MULTIPROC_METHOD=spawn ++ - bash ./run-tests.sh 
-c configs/models-small.txt -t 1 + +-- label: Worker Test +- mirror_hardwares: [amd] +- command: pytest -v -s worker ++- label: Encoder Decoder tests # 5min ++ source_file_dependencies: ++ - vllm/ ++ - tests/encoder_decoder ++ commands: ++ - pytest -v -s encoder_decoder + +-- label: Speculative decoding tests +- mirror_hardwares: [amd] +- command: pytest -v -s spec_decode ++- label: OpenAI-Compatible Tool Use # 20 min ++ fast_check: false ++ mirror_hardwares: [ amd ] ++ source_file_dependencies: ++ - vllm/ ++ - tests/tool_use ++ commands: ++ - pytest -v -s tool_use + +-- label: LoRA Test %N +- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT +- parallelism: 4 ++##### models test ##### + +-- label: Tensorizer Test +- command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader ++- label: Basic Models Test # 24min ++ source_file_dependencies: ++ - vllm/ ++ - tests/models ++ commands: ++ - pytest -v -s models/test_registry.py ++ - pytest -v -s models/test_initialization.py + +-- label: Metrics Test +- command: pytest -v -s metrics ++- label: Language Models Test (Standard) # 32min ++ #mirror_hardwares: [amd] ++ source_file_dependencies: ++ - vllm/ ++ - tests/models/decoder_only/language ++ - tests/models/embedding/language ++ - tests/models/encoder_decoder/language ++ commands: ++ - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' ++ - pytest -v -s models/embedding/language -m core_model + +-- label: Quantization Test +- command: pytest -v -s quantization ++- label: Language Models Test (Extended) # 1h10min ++ optional: true ++ source_file_dependencies: ++ - vllm/ ++ - tests/models/decoder_only/language ++ - tests/models/embedding/language ++ - tests/models/encoder_decoder/language ++ commands: ++ - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' ++ - pytest -v -s models/embedding/language -m 'not core_model' + +-- label: Benchmarks +- working_dir: "/vllm-workspace/.buildkite" +- mirror_hardwares: [amd] ++- label: Multi-Modal Models Test (Standard) # 40min ++ #mirror_hardwares: [amd] ++ source_file_dependencies: ++ - vllm/ ++ - tests/models/decoder_only/audio_language ++ - tests/models/decoder_only/vision_language ++ - tests/models/embedding/vision_language ++ - tests/models/encoder_decoder/audio_language ++ - tests/models/encoder_decoder/vision_language + commands: +- - pip install aiohttp +- - bash run-benchmarks.sh ++ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git ++ - pytest -v -s models/multimodal ++ - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' ++ - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' ++ - pytest -v -s models/embedding/vision_language -m core_model ++ - pytest -v -s models/encoder_decoder/audio_language -m core_model ++ - pytest -v -s models/encoder_decoder/language -m core_model ++ - pytest -v -s models/encoder_decoder/vision_language -m core_model + +-- label: Documentation Build +- working_dir: "/vllm-workspace/test_docs/docs" +- no_gpu: True ++- label: Multi-Modal Models Test (Extended) 1 # 48m ++ optional: true ++ source_file_dependencies: ++ - vllm/ ++ - tests/models/decoder_only/audio_language ++ - tests/models/decoder_only/vision_language ++ - tests/models/embedding/vision_language ++ - tests/models/encoder_decoder/vision_language + commands: +- - pip install -r requirements-docs.txt +- - 
SPHINXOPTS=\"-W\" make html ++ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git ++ - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' ++ - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model' ++ # HACK - run phi3v tests separately to sidestep this transformers bug ++ # https://github.com/huggingface/transformers/issues/34307 ++ - pytest -v -s models/decoder_only/vision_language/test_phi3v.py ++ - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' ++ - pytest -v -s models/embedding/vision_language -m 'not core_model' ++ - pytest -v -s models/encoder_decoder/language -m 'not core_model' ++ - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' ++ ++- label: Multi-Modal Models Test (Extended) 2 # 38m ++ optional: true ++ source_file_dependencies: ++ - vllm/ ++ - tests/models/decoder_only/vision_language ++ commands: ++ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git ++ - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model' ++ ++# This test is used only in PR development phase to test individual models and should never run on main ++- label: Custom Models Test ++ optional: true ++ commands: ++ - echo 'Testing custom models...' ++ # PR authors can temporarily add commands below to test individual models ++ # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py ++ # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* ++ ++##### 1 GPU test ##### ++##### multi gpus test ##### ++ ++- label: Distributed Comm Ops Test # 7min ++ working_dir: "/vllm-workspace/tests" ++ num_gpus: 2 ++ source_file_dependencies: ++ - vllm/distributed ++ - tests/distributed ++ commands: ++ - pytest -v -s distributed/test_comm_ops.py ++ - pytest -v -s distributed/test_shm_broadcast.py ++ ++- label: 2 Node Tests (4 GPUs in total) # 16min ++ working_dir: "/vllm-workspace/tests" ++ num_gpus: 2 ++ num_nodes: 2 ++ source_file_dependencies: ++ - vllm/distributed/ ++ - vllm/engine/ ++ - vllm/executor/ ++ - vllm/model_executor/models/ ++ - tests/distributed/ ++ commands: ++ - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) ++ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' ++ - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py ++ - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py ++ - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) ++ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' ++ ++- label: Distributed Tests (2 GPUs) # 40min ++ #mirror_hardwares: [amd] ++ working_dir: "/vllm-workspace/tests" ++ num_gpus: 2 ++ source_file_dependencies: ++ - vllm/distributed/ ++ - vllm/engine/ ++ - vllm/executor/ ++ - vllm/model_executor/models/ ++ - tests/distributed/ ++ - vllm/compilation ++ - vllm/worker/worker_base.py ++ - vllm/worker/worker.py ++ - vllm/worker/model_runner.py ++ commands: ++ - pytest -v 
-s ./compile/test_basic_correctness.py ++ - pytest -v -s ./compile/test_wrapper.py ++ - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' ++ - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' ++ # Avoid importing model tests that cause CUDA reinitialization error ++ - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' ++ - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' ++ - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' ++ - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py ++ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py ++ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py ++ ++- label: Plugin Tests (2 GPUs) # 40min ++ working_dir: "/vllm-workspace/tests" ++ num_gpus: 2 ++ fast_check: true ++ source_file_dependencies: ++ - vllm/plugins/ ++ - tests/plugins/ ++ commands: ++ # begin platform plugin tests, all the code in-between runs on dummy platform ++ - pip install -e ./plugins/vllm_add_dummy_platform ++ - pytest -v -s plugins_tests/test_platform_plugins.py ++ - pip uninstall vllm_add_dummy_platform -y ++ # end platform plugin tests ++ # other tests continue here: ++ - pip install -e ./plugins/vllm_add_dummy_model ++ - pytest -v -s distributed/test_distributed_oot.py ++ - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process ++ - pytest -v -s models/test_oot_registration.py # it needs a clean process ++ ++- label: Multi-step Tests (4 GPUs) # 36min ++ working_dir: "/vllm-workspace/tests" ++ num_gpus: 4 ++ source_file_dependencies: ++ - vllm/model_executor/layers/sampler.py ++ - vllm/sequence.py ++ - vllm/worker/worker_base.py ++ - vllm/worker/worker.py ++ - vllm/worker/multi_step_worker.py ++ - vllm/worker/model_runner_base.py ++ - vllm/worker/model_runner.py ++ - vllm/worker/multi_step_model_runner.py ++ - vllm/engine ++ - tests/multi_step ++ commands: ++ - pytest -v -s multi_step/test_correctness_async_llm.py ++ - pytest -v -s multi_step/test_correctness_llm.py ++ ++- label: Pipeline Parallelism Test # 45min ++ working_dir: "/vllm-workspace/tests" ++ num_gpus: 4 ++ source_file_dependencies: ++ - vllm/distributed/ ++ - vllm/engine/ ++ - vllm/executor/ ++ - vllm/model_executor/models/ ++ - tests/distributed/ ++ commands: ++ - pytest -v -s distributed/test_pp_cudagraph.py ++ - pytest -v -s distributed/test_pipeline_parallel.py ++ ++- label: LoRA TP Test (Distributed) ++ num_gpus: 4 ++ source_file_dependencies: ++ - vllm/lora ++ - tests/lora ++ commands: ++ # FIXIT: find out which code initialize cuda before running the test ++ # before the fix, we need to use spawn to test it ++ - export VLLM_WORKER_MULTIPROC_METHOD=spawn ++ # This test runs llama 13B, so it is required to run on 4 GPUs. ++ - pytest -v -s -x lora/test_long_context.py ++ # There is some Tensor Parallelism related processing logic in LoRA that ++ # requires multi-GPU testing for validation. 
++ - pytest -v -s -x lora/test_chatglm3_tp.py ++ - pytest -v -s -x lora/test_llama_tp.py ++ - pytest -v -s -x lora/test_minicpmv_tp.py ++ ++ ++- label: Weight Loading Multiple GPU Test # 33min ++ working_dir: "/vllm-workspace/tests" ++ num_gpus: 2 ++ source_file_dependencies: ++ - vllm/ ++ - tests/weight_loading ++ commands: ++ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt ++ ++- label: Weight Loading Multiple GPU Test - Large Models # optional ++ working_dir: "/vllm-workspace/tests" ++ num_gpus: 2 ++ gpu: a100 ++ optional: true ++ source_file_dependencies: ++ - vllm/ ++ - tests/weight_loading ++ commands: ++ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt ++ ++ ++##### multi gpus test ##### ++##### A100 test ##### ++ ++- label: Distributed Tests (A100) # optional ++ gpu: a100 ++ optional: true ++ num_gpus: 4 ++ source_file_dependencies: ++ - vllm/ ++ commands: ++ # NOTE: don't test llama model here, it seems hf implementation is buggy ++ # see https://github.com/vllm-project/vllm/pull/5689 for details ++ - pytest -v -s distributed/test_custom_all_reduce.py ++ - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py ++ - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' ++ - pytest -v -s -x lora/test_mixtral.py ++ ++- label: LM Eval Large Models # optional ++ gpu: a100 ++ optional: true ++ num_gpus: 4 ++ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" ++ source_file_dependencies: ++ - csrc/ ++ - vllm/model_executor/layers/quantization ++ commands: ++ - export VLLM_WORKER_MULTIPROC_METHOD=spawn ++ - bash ./run-tests.sh -c configs/models-large.txt -t 4 +diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh +new file mode 100644 +index 0000000..3c75665 +--- /dev/null ++++ b/.buildkite/upload-wheels.sh +@@ -0,0 +1,71 @@ ++#!/usr/bin/env bash ++ ++set -ex ++ ++# Assume wheels are in artifacts/dist/*.whl ++wheel_files=(artifacts/dist/*.whl) ++ ++# Check that exactly one wheel is found ++if [[ ${#wheel_files[@]} -ne 1 ]]; then ++ echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" ++ exit 1 ++fi ++ ++# Get the single wheel file ++wheel="${wheel_files[0]}" ++ ++# Rename 'linux' to 'manylinux1' in the wheel filename ++new_wheel="${wheel/linux/manylinux1}" ++mv -- "$wheel" "$new_wheel" ++wheel="$new_wheel" ++ ++# Extract the version from the wheel ++version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) ++echo "Version: $version" ++ ++normal_wheel="$wheel" # Save the original wheel filename ++ ++# If the version contains "dev", rename it to v1.0.0.dev for consistency ++if [[ $version == *dev* ]]; then ++ suffix="${version##*.}" ++ if [[ $suffix == cu* ]]; then ++ new_version="1.0.0.dev+${suffix}" ++ else ++ new_version="1.0.0.dev" ++ fi ++ new_wheel="${wheel/$version/$new_version}" ++ # use cp to keep both files in the artifacts directory ++ cp -- "$wheel" "$new_wheel" ++ wheel="$new_wheel" ++ version="$new_version" ++fi ++ ++# Upload the wheel to S3 ++python3 .buildkite/generate_index.py --wheel "$normal_wheel" ++ ++# generate index for this commit ++aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" ++aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" ++ ++if [[ $normal_wheel == *"cu118"* ]]; then ++ # if $normal_wheel matches cu118, do not upload the index.html ++ echo "Skipping index files for cu118 wheels" ++else ++ # only upload index.html for cu12 wheels 
(default wheels) ++ aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" ++ aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" ++fi ++ ++# generate index for nightly ++aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" ++aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" ++ ++if [[ $normal_wheel == *"cu118"* ]]; then ++ # if $normal_wheel matches cu118, do not upload the index.html ++ echo "Skipping index files for cu118 wheels" ++else ++ # only upload index.html for cu12 wheels (default wheels) ++ aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" ++fi ++ ++aws s3 cp "$wheel" "s3://vllm-wheels/$version/" +\ No newline at end of file +diff --git a/.clang-format b/.clang-format +new file mode 100644 +index 0000000..7f9e6d7 +--- /dev/null ++++ b/.clang-format +@@ -0,0 +1,26 @@ ++BasedOnStyle: Google ++UseTab: Never ++IndentWidth: 2 ++ColumnLimit: 80 ++ ++# Force pointers to the type for C++. ++DerivePointerAlignment: false ++PointerAlignment: Left ++ ++# Reordering #include statements can (and currently will) introduce errors ++SortIncludes: false ++ ++# Style choices ++AlignConsecutiveAssignments: false ++AlignConsecutiveDeclarations: false ++IndentPPDirectives: BeforeHash ++ ++IncludeCategories: ++ - Regex: '^<' ++ Priority: 4 ++ - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/' ++ Priority: 3 ++ - Regex: '^"(qoda|\.\.)/' ++ Priority: 2 ++ - Regex: '.*' ++ Priority: 1 +diff --git a/.dockerignore b/.dockerignore +index 5cfe0dc..3863656 100644 +--- a/.dockerignore ++++ b/.dockerignore +@@ -1 +1,33 @@ ++/.venv ++/build ++dist + vllm/*.so ++ ++# Byte-compiled / optimized / DLL files ++__pycache__/ ++*.py[cod] ++*$py.class ++ ++.mypy_cache ++ ++# Distribution / packaging ++.Python ++/build/ ++cmake-build-*/ ++CMakeUserPresets.json ++develop-eggs/ ++/dist/ ++downloads/ ++eggs/ ++.eggs/ ++lib/ ++lib64/ ++parts/ ++sdist/ ++var/ ++wheels/ ++share/python-wheels/ ++*.egg-info/ ++.installed.cfg ++*.egg ++MANIFEST +diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS +new file mode 100644 +index 0000000..3cb91fc +--- /dev/null ++++ b/.github/CODEOWNERS +@@ -0,0 +1,33 @@ ++# See https://help.github.com/articles/about-codeowners/ ++# for more info about CODEOWNERS file ++ ++# This lists cover the "core" components of vLLM that require careful review ++/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill ++/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill ++/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill ++/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill ++/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill ++/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill ++/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill ++CMakeLists.txt @tlrmchlsmth ++ ++# vLLM V1 ++/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic ++ ++# Test ownership ++/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo ++/tests/test_inputs.py @DarkLight1337 @ywang96 ++/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo ++/tests/models @DarkLight1337 @ywang96 ++/tests/multimodal @DarkLight1337 @ywang96 ++/tests/prefix_caching @comaniac @KuntaiDu ++/tests/spec_decode @njhill @LiuXiaoxuanPKU ++/tests/kernels 
@tlrmchlsmth @WoosukKwon ++/tests/quantization @mgoin @robertgshaw2-neuralmagic ++/.buildkite/lm-eval-harness @mgoin @simon-mo ++/tests/distributed/test_multi_node_assignment.py @youkaichao ++/tests/distributed/test_pipeline_parallel.py @youkaichao ++/tests/distributed/test_same_node.py @youkaichao ++/tests/multi_step @alexm-neuralmagic @comaniac ++/tests/weight_loading @mgoin @youkaichao ++/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac +diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml +new file mode 100644 +index 0000000..d1f6105 +--- /dev/null ++++ b/.github/FUNDING.yml +@@ -0,0 +1,2 @@ ++github: [vllm-project] ++open_collective: vllm +diff --git a/.github/ISSUE_TEMPLATE/100-documentation.yml b/.github/ISSUE_TEMPLATE/100-documentation.yml +index 501c0aa..74d397b 100644 +--- a/.github/ISSUE_TEMPLATE/100-documentation.yml ++++ b/.github/ISSUE_TEMPLATE/100-documentation.yml +@@ -20,3 +20,10 @@ body: + attributes: + value: > + Thanks for contributing 🎉! ++- type: checkboxes ++ id: askllm ++ attributes: ++ label: Before submitting a new issue... ++ options: ++ - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. ++ required: true +diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml +index df41ade..590e56c 100644 +--- a/.github/ISSUE_TEMPLATE/200-installation.yml ++++ b/.github/ISSUE_TEMPLATE/200-installation.yml +@@ -38,3 +38,10 @@ body: + attributes: + value: > + Thanks for contributing 🎉! ++- type: checkboxes ++ id: askllm ++ attributes: ++ label: Before submitting a new issue... ++ options: ++ - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. ++ required: true +diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml +index 54763af..004798a 100644 +--- a/.github/ISSUE_TEMPLATE/300-usage.yml ++++ b/.github/ISSUE_TEMPLATE/300-usage.yml +@@ -36,3 +36,10 @@ body: + attributes: + value: > + Thanks for contributing 🎉! ++- type: checkboxes ++ id: askllm ++ attributes: ++ label: Before submitting a new issue... ++ options: ++ - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. ++ required: true +diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml +new file mode 100644 +index 0000000..30db172 +--- /dev/null ++++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml +@@ -0,0 +1,107 @@ ++name: 🐛 Bug report ++description: Raise an issue here if you find a bug. ++title: "[Bug]: " ++labels: ["bug"] ++ ++body: ++- type: markdown ++ attributes: ++ value: > ++ #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). ++- type: textarea ++ attributes: ++ label: Your current environment ++ description: | ++ Please run the following and paste the output below. 
++ ```sh ++ wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py ++ # For security purposes, please feel free to check the contents of collect_env.py before running it. ++ python collect_env.py ++ ``` ++ It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. ++ value: | ++
++      <details>
++      <summary>The output of `python collect_env.py`</summary>
++
++      ```text
++      Your output of `python collect_env.py` here
++      ```
++
++      </details>
++ validations: ++ required: true ++- type: textarea ++ attributes: ++ label: Model Input Dumps ++ description: | ++ If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process. ++ placeholder: | ++ Upload the dumped input file. ++ validations: ++ required: false ++- type: textarea ++ attributes: ++ label: 🐛 Describe the bug ++ description: | ++ Please provide a clear and concise description of what the bug is. ++ ++ If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example: ++ ++ ```python ++ from vllm import LLM, SamplingParams ++ ++ prompts = [ ++ "Hello, my name is", ++ "The president of the United States is", ++ "The capital of France is", ++ "The future of AI is", ++ ] ++ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ++ ++ llm = LLM(model="facebook/opt-125m") ++ ++ outputs = llm.generate(prompts, sampling_params) ++ ++ # Print the outputs. ++ for output in outputs: ++ prompt = output.prompt ++ generated_text = output.outputs[0].text ++ print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ++ ``` ++ ++ If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. ++ ++ Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. ++ ++ Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues. ++ ++ If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs. ++ placeholder: | ++ A clear and concise description of what the bug is. ++ ++ ```python ++ # Sample code to reproduce the problem ++ ``` ++ ++ ``` ++ The error message you got, with the full traceback. ++ ``` ++ validations: ++ required: true ++- type: markdown ++ attributes: ++ value: > ++ ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output: ++ ++ - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc). ++ ++ - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. ++ ++ Thanks for contributing 🎉! ++- type: checkboxes ++ id: askllm ++ attributes: ++ label: Before submitting a new issue... 
++ options: ++ - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. ++ required: true +diff --git a/.github/ISSUE_TEMPLATE/500-feature-request.yml b/.github/ISSUE_TEMPLATE/500-feature-request.yml +new file mode 100644 +index 0000000..097d88f +--- /dev/null ++++ b/.github/ISSUE_TEMPLATE/500-feature-request.yml +@@ -0,0 +1,38 @@ ++name: 🚀 Feature request ++description: Submit a proposal/request for a new vllm feature ++title: "[Feature]: " ++labels: ["feature request"] ++ ++body: ++- type: markdown ++ attributes: ++ value: > ++ #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). ++- type: textarea ++ attributes: ++ label: 🚀 The feature, motivation and pitch ++ description: > ++ A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. ++ validations: ++ required: true ++- type: textarea ++ attributes: ++ label: Alternatives ++ description: > ++ A description of any alternative solutions or features you've considered, if any. ++- type: textarea ++ attributes: ++ label: Additional context ++ description: > ++ Add any other context or screenshots about the feature request. ++- type: markdown ++ attributes: ++ value: > ++ Thanks for contributing 🎉! ++- type: checkboxes ++ id: askllm ++ attributes: ++ label: Before submitting a new issue... ++ options: ++ - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. ++ required: true +diff --git a/.github/ISSUE_TEMPLATE/600-new-model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml +new file mode 100644 +index 0000000..713e76c +--- /dev/null ++++ b/.github/ISSUE_TEMPLATE/600-new-model.yml +@@ -0,0 +1,40 @@ ++name: 🤗 Support request for a new model from huggingface ++description: Submit a proposal/request for a new model from huggingface ++title: "[New Model]: " ++labels: ["new model"] ++ ++body: ++- type: markdown ++ attributes: ++ value: > ++ #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). ++ ++ #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model. ++- type: textarea ++ attributes: ++ label: The model to consider. ++ description: > ++ A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 . ++ validations: ++ required: true ++- type: textarea ++ attributes: ++ label: The closest model vllm already supports. ++ description: > ++ Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for? 
++- type: textarea ++ attributes: ++ label: What's your difficulty of supporting the model you want? ++ description: > ++ For example, any new operators or new architecture? ++- type: markdown ++ attributes: ++ value: > ++ Thanks for contributing 🎉! ++- type: checkboxes ++ id: askllm ++ attributes: ++ label: Before submitting a new issue... ++ options: ++ - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. ++ required: true +diff --git a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml +new file mode 100644 +index 0000000..273f50d +--- /dev/null ++++ b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml +@@ -0,0 +1,59 @@ ++name: ⚡ Discussion on the performance of vllm ++description: Submit a proposal/discussion about the performance of vllm ++title: "[Performance]: " ++labels: ["performance"] ++ ++body: ++- type: markdown ++ attributes: ++ value: > ++ #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). ++- type: textarea ++ attributes: ++ label: Proposal to improve performance ++ description: > ++ How do you plan to improve vllm's performance? ++ validations: ++ required: false ++- type: textarea ++ attributes: ++ label: Report of performance regression ++ description: > ++ Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks . ++ validations: ++ required: false ++- type: textarea ++ attributes: ++ label: Misc discussion on performance ++ description: > ++ Anything about the performance. ++ validations: ++ required: false ++- type: textarea ++ attributes: ++ label: Your current environment (if you think it is necessary) ++ description: | ++ Please run the following and paste the output below. ++ ```sh ++ wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py ++ # For security purposes, please feel free to check the contents of collect_env.py before running it. ++ python collect_env.py ++ ``` ++ It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. ++ value: | ++ ```text ++ The output of `python collect_env.py` ++ ``` ++ validations: ++ required: false ++- type: markdown ++ attributes: ++ value: > ++ Thanks for contributing 🎉! ++- type: checkboxes ++ id: askllm ++ attributes: ++ label: Before submitting a new issue... ++ options: ++ - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. ++ required: true +diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml +index 5382b12..e447c07 100644 +--- a/.github/ISSUE_TEMPLATE/750-RFC.yml ++++ b/.github/ISSUE_TEMPLATE/750-RFC.yml +@@ -47,3 +47,10 @@ body: + attributes: + value: > + Thanks for contributing 🎉! ++- type: checkboxes ++ id: askllm ++ attributes: ++ label: Before submitting a new issue... 
++ options: ++ - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. ++ required: true +diff --git a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml +new file mode 100644 +index 0000000..79e6e90 +--- /dev/null ++++ b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml +@@ -0,0 +1,28 @@ ++name: 🎲 Misc/random discussions that do not fit into the above categories. ++description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. ++title: "[Misc]: " ++labels: ["misc"] ++ ++body: ++- type: markdown ++ attributes: ++ value: > ++ #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). ++- type: textarea ++ attributes: ++ label: Anything you want to discuss about vllm. ++ description: > ++ Anything you want to discuss about vllm. ++ validations: ++ required: true ++- type: markdown ++ attributes: ++ value: > ++ Thanks for contributing 🎉! ++- type: checkboxes ++ id: askllm ++ attributes: ++ label: Before submitting a new issue... ++ options: ++ - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. ++ required: true +diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md +index 262ce8e..51a73c8 100644 +--- a/.github/PULL_REQUEST_TEMPLATE.md ++++ b/.github/PULL_REQUEST_TEMPLATE.md +@@ -2,63 +2,4 @@ FILL IN THE PR DESCRIPTION HERE + + FIX #xxxx (*link existing issues this PR will resolve*) + +-**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** +- +---- +- +-
+-<details>
+-<summary> PR Checklist (Click to Expand) </summary>
+-
+-<p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>
+-
+-<h3>PR Title and Classification</h3>
+-<p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
+-<ul>
+-    <li>[Bugfix] for bug fixes.</li>
+-    <li>[CI/Build] for build or continuous integration improvements.</li>
+-    <li>[Doc] for documentation fixes and improvements.</li>
+-    <li>[Model] for adding a new model or improving an existing model. Model name should appear in the title.</li>
+-    <li>[Frontend] For changes on the vLLM frontend (e.g., OpenAI API server, LLM class, etc.)</li>
+-    <li>[Kernel] for changes affecting CUDA kernels or other compute kernels.</li>
+-    <li>[Core] for changes in the core vLLM logic (e.g., LLMEngine, AsyncLLMEngine, Scheduler, etc.)</li>
+-    <li>[Hardware][Vendor] for hardware-specific changes. Vendor name should appear in the prefix (e.g., [Hardware][AMD]).</li>
+-    <li>[Misc] for PRs that do not fit the above categories. Please use this sparingly.</li>
+-</ul>
+-
+-<p>Note: If the PR spans more than one category, please include all relevant prefixes.</p>
+-
+-<h3>Code Quality</h3>
+-
+-<p>The PR need to meet the following code quality standards:</p>
+-
+-<ul>
+-    <li>We adhere to Google Python style guide and Google C++ style guide.</li>
+-    <li>Pass all linter checks. Please use format.sh to format your code.</li>
+-    <li>The code need to be well-documented to ensure future contributors can easily understand the code.</li>
+-    <li>Include sufficient tests to ensure the project to stay correct and robust. This includes both unit tests and integration tests.</li>
+-    <li>Please add documentation to docs/source/ if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
+-</ul>
+-
+-<h3>Notes for Large Changes</h3>
+-
+-<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with rfc-required and might not go through the PR.</p>
+-
+-<h3>What to Expect for the Reviews</h3>
+-
+-<p>The goal of the vLLM team is to be a transparent reviewing machine. We would like to make the review process transparent and efficient and make sure no contributor feel confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:</p>
+-
+-<ul>
+-    <li>After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
+-    <li>After the PR is assigned, the reviewer will provide status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
+-    <li>After the review, the reviewer will put an action-required label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
+-    <li>Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.</li>
+-</ul>
+-
+-<h3>Thank You</h3>
+-
+-<p>Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!</p>
+-
+-</details>
+- +- ++**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html ** +diff --git a/.github/dependabot.yml b/.github/dependabot.yml +new file mode 100644 +index 0000000..683b70c +--- /dev/null ++++ b/.github/dependabot.yml +@@ -0,0 +1,31 @@ ++version: 2 ++updates: ++ # Maintain dependencies for GitHub Actions ++ - package-ecosystem: "github-actions" ++ directory: "/" ++ schedule: ++ interval: "weekly" ++ - package-ecosystem: "pip" ++ directory: "/" ++ schedule: ++ interval: "weekly" ++ labels: ["dependencies"] ++ open-pull-requests-limit: 5 ++ reviewers: ["khluu", "simon-mo"] ++ allow: ++ - dependency-type: "all" ++ ignore: ++ - dependency-name: "*" ++ update-types: ["version-update:semver-patch"] ++ - dependency-name: "torch" ++ - dependency-name: "torchvision" ++ - dependency-name: "xformers" ++ - dependency-name: "lm-format-enforcer" ++ - dependency-name: "gguf" ++ - dependency-name: "compressed-tensors" ++ - dependency-name: "ray[adag]" ++ - dependency-name: "lm-eval" ++ groups: ++ minor-update: ++ applies-to: version-updates ++ update-types: ["minor"] +diff --git a/.github/mergify.yml b/.github/mergify.yml +new file mode 100644 +index 0000000..ca4bd7e +--- /dev/null ++++ b/.github/mergify.yml +@@ -0,0 +1,60 @@ ++pull_request_rules: ++- name: label-documentation ++ description: Automatically apply documentation label ++ conditions: ++ - or: ++ - files~=^[^/]+\.md$ ++ - files~=^docs/ ++ actions: ++ label: ++ add: ++ - documentation ++ ++- name: label-ci-build ++ description: Automatically apply ci/build label ++ conditions: ++ - or: ++ - files~=^\.github/ ++ - files~=\.buildkite/ ++ - files~=^cmake/ ++ - files=CMakeLists.txt ++ - files~=^Dockerfile ++ - files~=^requirements.*\.txt ++ - files=setup.py ++ actions: ++ label: ++ add: ++ - ci/build ++ ++- name: label-frontend ++ description: Automatically apply frontend label ++ conditions: ++ - files~=^vllm/entrypoints/ ++ actions: ++ label: ++ add: ++ - frontend ++ ++- name: ping author on conflicts and add 'needs-rebase' label ++ conditions: ++ - conflict ++ - -closed ++ actions: ++ label: ++ add: ++ - needs-rebase ++ comment: ++ message: | ++ This pull request has merge conflicts that must be resolved before it can be ++ merged. Please rebase the PR, @{{author}}. ++ ++ https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork ++ ++- name: remove 'needs-rebase' label when conflict is resolved ++ conditions: ++ - -conflict ++ - -closed ++ actions: ++ label: ++ remove: ++ - needs-rebase +diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh +new file mode 100644 +index 0000000..3246c6f +--- /dev/null ++++ b/.github/scripts/cleanup_pr_body.sh +@@ -0,0 +1,50 @@ ++#!/bin/bash ++ ++set -eu ++ ++# ensure 1 argument is passed ++if [ "$#" -ne 1 ]; then ++ echo "Usage: $0 " ++ exit 1 ++fi ++ ++PR_NUMBER=$1 ++OLD=/tmp/orig_pr_body.txt ++NEW=/tmp/new_pr_body.txt ++ ++gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}" ++cp "${OLD}" "${NEW}" ++ ++# Remove "FIX #xxxx (*link existing issues this PR will resolve*)" ++sed -i '/FIX #xxxx.*$/d' "${NEW}" ++ ++# Remove "FILL IN THE PR DESCRIPTION HERE" ++sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}" ++ ++# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**" ++sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}" ++ ++# Remove HTML
<details> section that includes <summary> text of "PR Checklist (Click to Expand)"
++python3 - <<EOF
++import re
++
++with open("${NEW}", "r") as file:
++    content = file.read()
++
++pattern = re.compile(r'(---\n\n)?<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>
', re.DOTALL) ++content = re.sub(pattern, '', content) ++ ++with open("${NEW}", "w") as file: ++ file.write(content) ++EOF ++ ++# Run this only if ${NEW} is different than ${OLD} ++if ! cmp -s "${OLD}" "${NEW}"; then ++ gh pr edit --body-file "${NEW}" "${PR_NUMBER}" ++ echo ++ echo "Updated PR body:" ++ echo ++ cat "${NEW}" ++else ++ echo "No changes needed" ++fi +diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml +new file mode 100644 +index 0000000..0226cf0 +--- /dev/null ++++ b/.github/workflows/actionlint.yml +@@ -0,0 +1,40 @@ ++name: Lint GitHub Actions workflows ++on: ++ push: ++ branches: ++ - "main" ++ paths: ++ - '.github/workflows/*.ya?ml' ++ - '.github/workflows/actionlint.*' ++ - '.github/workflows/matchers/actionlint.json' ++ pull_request: ++ branches: ++ - "main" ++ paths: ++ - '.github/workflows/*.ya?ml' ++ - '.github/workflows/actionlint.*' ++ - '.github/workflows/matchers/actionlint.json' ++ ++env: ++ LC_ALL: en_US.UTF-8 ++ ++defaults: ++ run: ++ shell: bash ++ ++permissions: ++ contents: read ++ ++jobs: ++ actionlint: ++ runs-on: ubuntu-latest ++ steps: ++ - name: "Checkout" ++ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 ++ with: ++ fetch-depth: 0 ++ ++ - name: "Run actionlint" ++ run: | ++ echo "::add-matcher::.github/workflows/matchers/actionlint.json" ++ tools/actionlint.sh -color +diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml +new file mode 100644 +index 0000000..c9d6d42 +--- /dev/null ++++ b/.github/workflows/add_label_automerge.yml +@@ -0,0 +1,21 @@ ++name: Add label on auto-merge enabled ++on: ++ pull_request_target: ++ types: ++ - auto_merge_enabled ++jobs: ++ add-label-on-auto-merge: ++ runs-on: ubuntu-latest ++ steps: ++ - name: Add label ++ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 ++ with: ++ script: | ++ github.rest.issues.addLabels({ ++ owner: context.repo.owner, ++ repo: context.repo.repo, ++ issue_number: context.issue.number, ++ labels: ['ready'] ++ }) ++ env: ++ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} +diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml +new file mode 100644 +index 0000000..68149d2 +--- /dev/null ++++ b/.github/workflows/clang-format.yml +@@ -0,0 +1,53 @@ ++name: clang-format ++ ++on: ++ # Trigger the workflow on push or pull request, ++ # but only for the main branch ++ push: ++ branches: ++ - main ++ paths: ++ - '**/*.h' ++ - '**/*.cpp' ++ - '**/*.cu' ++ - '**/*.cuh' ++ - '.github/workflows/clang-format.yml' ++ pull_request: ++ branches: ++ - main ++ paths: ++ - '**/*.h' ++ - '**/*.cpp' ++ - '**/*.cu' ++ - '**/*.cuh' ++ - '.github/workflows/clang-format.yml' ++ ++jobs: ++ clang-format: ++ runs-on: ubuntu-latest ++ strategy: ++ matrix: ++ python-version: ["3.11"] ++ steps: ++ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 ++ - name: Set up Python ${{ matrix.python-version }} ++ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 ++ with: ++ python-version: ${{ matrix.python-version }} ++ - name: Install dependencies ++ run: | ++ python -m pip install --upgrade pip ++ pip install clang-format==18.1.5 ++ - name: Running clang-format ++ run: | ++ EXCLUDES=( ++ 'csrc/moe/topk_softmax_kernels.cu' ++ 'csrc/quantization/gguf/ggml-common.h' ++ 'csrc/quantization/gguf/dequantize.cuh' ++ 'csrc/quantization/gguf/vecdotq.cuh' ++ 'csrc/quantization/gguf/mmq.cuh' ++ 'csrc/quantization/gguf/mmvq.cuh' ++ ) ++ find csrc/ 
\( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ ++ | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ ++ | xargs clang-format --dry-run --Werror +diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml +new file mode 100644 +index 0000000..0085a1c +--- /dev/null ++++ b/.github/workflows/cleanup_pr_body.yml +@@ -0,0 +1,26 @@ ++name: Cleanup PR Body ++ ++on: ++ pull_request_target: ++ types: [opened, reopened, edited] ++ ++permissions: ++ pull-requests: write ++ ++jobs: ++ update-description: ++ runs-on: ubuntu-latest ++ ++ steps: ++ - name: Checkout repository ++ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 ++ ++ - name: Set up Python ++ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 ++ with: ++ python-version: '3.12' ++ ++ - name: Update PR description ++ env: ++ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ++ run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" +diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml +new file mode 100644 +index 0000000..68887ad +--- /dev/null ++++ b/.github/workflows/codespell.yml +@@ -0,0 +1,45 @@ ++name: codespell ++ ++on: ++ # Trigger the workflow on push or pull request, ++ # but only for the main branch ++ push: ++ branches: ++ - main ++ paths: ++ - "**/*.py" ++ - "**/*.md" ++ - "**/*.rst" ++ - pyproject.toml ++ - requirements-lint.txt ++ - .github/workflows/codespell.yml ++ pull_request: ++ branches: ++ - main ++ paths: ++ - "**/*.py" ++ - "**/*.md" ++ - "**/*.rst" ++ - pyproject.toml ++ - requirements-lint.txt ++ - .github/workflows/codespell.yml ++ ++jobs: ++ codespell: ++ runs-on: ubuntu-latest ++ strategy: ++ matrix: ++ python-version: ["3.12"] ++ steps: ++ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 ++ - name: Set up Python ${{ matrix.python-version }} ++ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 ++ with: ++ python-version: ${{ matrix.python-version }} ++ - name: Install dependencies ++ run: | ++ python -m pip install --upgrade pip ++ pip install -r requirements-lint.txt ++ - name: Spelling check with codespell ++ run: | ++ codespell --toml pyproject.toml +diff --git a/.github/workflows/doc-lint.yml b/.github/workflows/doc-lint.yml +new file mode 100644 +index 0000000..2f5ee8b +--- /dev/null ++++ b/.github/workflows/doc-lint.yml +@@ -0,0 +1,32 @@ ++name: Lint documentation ++ ++on: ++ push: ++ branches: ++ - main ++ paths: ++ - "docs/**" ++ pull_request: ++ branches: ++ - main ++ paths: ++ - "docs/**" ++ ++jobs: ++ doc-lint: ++ runs-on: ubuntu-latest ++ strategy: ++ matrix: ++ python-version: ["3.12"] ++ steps: ++ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 ++ - name: Set up Python ${{ matrix.python-version }} ++ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 ++ with: ++ python-version: ${{ matrix.python-version }} ++ - name: Install dependencies ++ run: | ++ python -m pip install --upgrade pip ++ pip install -r requirements-lint.txt ++ - name: Linting docs ++ run: tools/doc-lint.sh +diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml +new file mode 100644 +index 0000000..556b60d +--- /dev/null ++++ b/.github/workflows/lint-and-deploy.yaml +@@ -0,0 +1,82 @@ ++name: Lint and Deploy Charts ++ ++on: pull_request ++ ++jobs: ++ lint-and-deploy: ++ runs-on: ubuntu-latest ++ steps: ++ - name: Checkout ++ uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 ++ with: ++ fetch-depth: 0 ++ ++ - name: Set up Helm ++ uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0 ++ with: ++ version: v3.14.4 ++ ++ #Python is required because ct lint runs Yamale and yamllint which require Python. ++ - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 ++ with: ++ python-version: '3.13' ++ ++ - name: Set up chart-testing ++ uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1 ++ with: ++ version: v3.10.1 ++ ++ - name: Run chart-testing (lint) ++ run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm ++ ++ - name: Setup minio ++ run: | ++ docker network create vllm-net ++ docker run -d -p 9000:9000 --name minio --net vllm-net \ ++ -e "MINIO_ACCESS_KEY=minioadmin" \ ++ -e "MINIO_SECRET_KEY=minioadmin" \ ++ -v /tmp/data:/data \ ++ -v /tmp/config:/root/.minio \ ++ minio/minio server /data ++ export AWS_ACCESS_KEY_ID=minioadmin ++ export AWS_SECRET_ACCESS_KEY=minioadmin ++ export AWS_EC2_METADATA_DISABLED=true ++ mkdir opt-125m ++ cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd .. ++ aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket ++ aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive ++ ++ - name: Create kind cluster ++ uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0 ++ ++ - name: Build the Docker image vllm cpu ++ run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env . 
++ ++ - name: Configuration of docker images, network and namespace for the kind cluster ++ run: | ++ docker pull amazon/aws-cli:2.6.4 ++ kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing ++ kind load docker-image vllm-cpu-env:latest --name chart-testing ++ docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")" ++ kubectl create ns ns-vllm ++ ++ - name: Run chart-testing (install) ++ run: | ++ export AWS_ACCESS_KEY_ID=minioadmin ++ export AWS_SECRET_ACCESS_KEY=minioadmin ++ sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & ++ helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" ++ ++ - name: curl test ++ run: | ++ kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 & ++ sleep 10 ++ CODE="$(curl -v -f --location http://localhost:8001/v1/completions \ ++ --header "Content-Type: application/json" \ ++ --data '{ ++ "model": "opt-125m", ++ "prompt": "San Francisco is a", ++ "max_tokens": 7, ++ "temperature": 0 ++ }'):$CODE" ++ echo "$CODE" +\ No newline at end of file +diff --git a/.github/workflows/matchers/actionlint.json b/.github/workflows/matchers/actionlint.json +new file mode 100644 +index 0000000..4613e16 +--- /dev/null ++++ b/.github/workflows/matchers/actionlint.json +@@ -0,0 +1,17 @@ ++{ ++ "problemMatcher": [ ++ { ++ "owner": "actionlint", ++ "pattern": [ ++ { ++ "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", ++ "file": 1, ++ "line": 2, ++ "column": 3, ++ "message": 4, ++ "code": 5 ++ } ++ ] ++ } ++ ] ++} +diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json +new file mode 100644 +index 0000000..f048fce +--- /dev/null ++++ b/.github/workflows/matchers/mypy.json +@@ -0,0 +1,16 @@ ++{ ++ "problemMatcher": [ ++ { ++ "owner": "mypy", ++ "pattern": [ ++ { ++ "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", ++ "file": 1, ++ "line": 2, ++ "severity": 3, ++ "message": 4 ++ } ++ ] ++ } ++ ] ++} +diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json +new file mode 100644 +index 0000000..f6d4479 +--- /dev/null ++++ b/.github/workflows/matchers/ruff.json +@@ -0,0 +1,17 @@ ++{ ++ "problemMatcher": [ ++ { ++ "owner": "ruff", ++ "pattern": [ ++ { ++ "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", ++ "file": 1, ++ "line": 2, ++ "column": 3, ++ "code": 4, ++ "message": 5 ++ } ++ ] ++ } ++ ] ++ } +diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml +index a20753d..73eeacf 100644 +--- a/.github/workflows/mypy.yaml ++++ b/.github/workflows/mypy.yaml 
+@@ -6,45 +6,46 @@ on: + push: + branches: + - main ++ paths: ++ - '**/*.py' ++ - '.github/workflows/mypy.yaml' ++ - 'tools/mypy.sh' ++ - 'pyproject.toml' + pull_request: + branches: + - main ++ # This workflow is only relevant when one of the following files changes. ++ # However, we have github configured to expect and require this workflow ++ # to run and pass before github with auto-merge a pull request. Until github ++ # allows more flexible auto-merge policy, we can just run this on every PR. ++ # It doesn't take that long to run, anyway. ++ #paths: ++ # - '**/*.py' ++ # - '.github/workflows/mypy.yaml' ++ # - 'tools/mypy.sh' ++ # - 'pyproject.toml' + + jobs: +- ruff: ++ mypy: + runs-on: ubuntu-latest + strategy: + matrix: +- python-version: ["3.8", "3.9", "3.10", "3.11"] ++ python-version: ["3.9", "3.10", "3.11", "3.12"] + steps: +- - uses: actions/checkout@v2 ++ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} +- uses: actions/setup-python@v2 ++ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip +- pip install mypy==1.9.0 ++ pip install mypy==1.11.1 + pip install types-setuptools + pip install types-PyYAML + pip install types-requests + pip install types-setuptools + - name: Mypy + run: | +- mypy vllm/attention --config-file pyproject.toml +- mypy vllm/core --config-file pyproject.toml +- mypy vllm/distributed --config-file pyproject.toml +- mypy vllm/entrypoints --config-file pyproject.toml +- mypy vllm/executor --config-file pyproject.toml +- mypy vllm/usage --config-file pyproject.toml +- mypy vllm/*.py --config-file pyproject.toml +- mypy vllm/transformers_utils --config-file pyproject.toml +- mypy vllm/engine --config-file pyproject.toml +- mypy vllm/worker --config-file pyproject.toml +- mypy vllm/spec_decode --config-file pyproject.toml +- mypy vllm/model_executor --config-file pyproject.toml +- mypy vllm/lora --config-file pyproject.toml +- mypy vllm/logging --config-file pyproject.toml +- mypy vllm/model_executor --config-file pyproject.toml +- ++ echo "::add-matcher::.github/workflows/matchers/mypy.json" ++ tools/mypy.sh 1 ${{ matrix.python-version }} +diff --git a/.github/workflows/png-lint.yml b/.github/workflows/png-lint.yml +new file mode 100644 +index 0000000..4932af9 +--- /dev/null ++++ b/.github/workflows/png-lint.yml +@@ -0,0 +1,37 @@ ++name: Lint PNG exports from excalidraw ++on: ++ push: ++ branches: ++ - "main" ++ paths: ++ - '*.excalidraw.png' ++ - '.github/workflows/png-lint.yml' ++ pull_request: ++ branches: ++ - "main" ++ paths: ++ - '*.excalidraw.png' ++ - '.github/workflows/png-lint.yml' ++ ++env: ++ LC_ALL: en_US.UTF-8 ++ ++defaults: ++ run: ++ shell: bash ++ ++permissions: ++ contents: read ++ ++jobs: ++ actionlint: ++ runs-on: ubuntu-latest ++ steps: ++ - name: "Checkout" ++ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 ++ with: ++ fetch-depth: 0 ++ ++ - name: "Run png-lint.sh to check excalidraw exported images" ++ run: | ++ tools/png-lint.sh +diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml +index ac60ce0..e40ceaa 100644 +--- a/.github/workflows/publish.yml ++++ b/.github/workflows/publish.yml +@@ -21,16 +21,16 @@ jobs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + steps: + - name: Checkout +- uses: actions/checkout@v3 ++ uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Extract branch info + shell: bash + run: | +- echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV ++ echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV" + + - name: Create Release + id: create_release +- uses: "actions/github-script@v6" ++ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + env: + RELEASE_TAG: ${{ env.release_tag }} + with: +@@ -39,64 +39,68 @@ jobs: + const script = require('.github/workflows/scripts/create_release.js') + await script(github, context, core) + +- wheel: +- name: Build Wheel +- runs-on: ${{ matrix.os }} +- needs: release +- +- strategy: +- fail-fast: false +- matrix: +- os: ['ubuntu-20.04'] +- python-version: ['3.8', '3.9', '3.10', '3.11'] +- pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt. +- cuda-version: ['11.8', '12.1'] +- +- steps: +- - name: Checkout +- uses: actions/checkout@v3 +- +- - name: Setup ccache +- uses: hendrikmuhs/ccache-action@v1.2 +- +- - name: Set up Linux Env +- if: ${{ runner.os == 'Linux' }} +- run: | +- bash -x .github/workflows/scripts/env.sh +- +- - name: Set up Python +- uses: actions/setup-python@v4 +- with: +- python-version: ${{ matrix.python-version }} +- +- - name: Install CUDA ${{ matrix.cuda-version }} +- run: | +- bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }} +- +- - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }} +- run: | +- bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }} +- +- - name: Build wheel +- shell: bash +- env: +- CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size +- run: | +- bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} +- wheel_name=$(ls dist/*whl | xargs -n 1 basename) +- asset_name=${wheel_name//"linux"/"manylinux1"} +- echo "wheel_name=${wheel_name}" >> $GITHUB_ENV +- echo "asset_name=${asset_name}" >> $GITHUB_ENV +- +- - name: Upload Release Asset +- uses: actions/upload-release-asset@v1 +- env: +- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} +- with: +- upload_url: ${{ needs.release.outputs.upload_url }} +- asset_path: ./dist/${{ env.wheel_name }} +- asset_name: ${{ env.asset_name }} +- asset_content_type: application/* ++ # NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow. ++ # wheel: ++ # name: Build Wheel ++ # runs-on: ${{ matrix.os }} ++ # needs: release ++ ++ # strategy: ++ # fail-fast: false ++ # matrix: ++ # os: ['ubuntu-20.04'] ++ # python-version: ['3.9', '3.10', '3.11', '3.12'] ++ # pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt. 
++ # cuda-version: ['11.8', '12.1'] ++ ++ # steps: ++ # - name: Checkout ++ # uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 ++ ++ # - name: Setup ccache ++ # uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14 ++ # with: ++ # create-symlink: true ++ # key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }} ++ ++ # - name: Set up Linux Env ++ # if: ${{ runner.os == 'Linux' }} ++ # run: | ++ # bash -x .github/workflows/scripts/env.sh ++ ++ # - name: Set up Python ++ # uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 ++ # with: ++ # python-version: ${{ matrix.python-version }} ++ ++ # - name: Install CUDA ${{ matrix.cuda-version }} ++ # run: | ++ # bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }} ++ ++ # - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }} ++ # run: | ++ # bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }} ++ ++ # - name: Build wheel ++ # shell: bash ++ # env: ++ # CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size ++ # run: | ++ # bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} ++ # wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename) ++ # asset_name=${wheel_name//"linux"/"manylinux1"} ++ # echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV" ++ # echo "asset_name=${asset_name}" >> "$GITHUB_ENV" ++ ++ # - name: Upload Release Asset ++ # uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2 ++ # env: ++ # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ++ # with: ++ # upload_url: ${{ needs.release.outputs.upload_url }} ++ # asset_path: ./dist/${{ env.wheel_name }} ++ # asset_name: ${{ env.asset_name }} ++ # asset_content_type: application/* + + # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested + # - name: Publish package +diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml +new file mode 100644 +index 0000000..df62539 +--- /dev/null ++++ b/.github/workflows/reminder_comment.yml +@@ -0,0 +1,21 @@ ++name: PR Reminder Comment Bot ++on: ++ pull_request_target: ++ types: [opened] ++ ++jobs: ++ pr_reminder: ++ runs-on: ubuntu-latest ++ steps: ++ - name: Remind to run full CI on PR ++ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 ++ with: ++ script: | ++ github.rest.issues.createComment({ ++ owner: context.repo.owner, ++ repo: context.repo.repo, ++ issue_number: context.issue.number, ++ body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. 
\n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' ++ }) ++ env: ++ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} +diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml +index e71033f..7266cc3 100644 +--- a/.github/workflows/ruff.yml ++++ b/.github/workflows/ruff.yml +@@ -6,32 +6,47 @@ on: + push: + branches: + - main ++ paths: ++ - "**/*.py" ++ - pyproject.toml ++ - requirements-lint.txt ++ - .github/workflows/matchers/ruff.json ++ - .github/workflows/ruff.yml + pull_request: + branches: + - main ++ # This workflow is only relevant when one of the following files changes. ++ # However, we have github configured to expect and require this workflow ++ # to run and pass before github with auto-merge a pull request. Until github ++ # allows more flexible auto-merge policy, we can just run this on every PR. ++ # It doesn't take that long to run, anyway. ++ #paths: ++ # - "**/*.py" ++ # - pyproject.toml ++ # - requirements-lint.txt ++ # - .github/workflows/matchers/ruff.json ++ # - .github/workflows/ruff.yml + + jobs: + ruff: + runs-on: ubuntu-latest + strategy: + matrix: +- python-version: ["3.8", "3.9", "3.10", "3.11"] ++ python-version: ["3.12"] + steps: +- - uses: actions/checkout@v2 +- - name: Set up Python ${{ matrix.python-version }} +- uses: actions/setup-python@v2 +- with: +- python-version: ${{ matrix.python-version }} +- - name: Install dependencies +- run: | +- python -m pip install --upgrade pip +- pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2 +- - name: Analysing the code with ruff +- run: | +- ruff . +- - name: Spelling check with codespell +- run: | +- codespell --toml pyproject.toml +- - name: Run isort +- run: | +- isort . --check-only ++ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 ++ - name: Set up Python ${{ matrix.python-version }} ++ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 ++ with: ++ python-version: ${{ matrix.python-version }} ++ - name: Install dependencies ++ run: | ++ python -m pip install --upgrade pip ++ pip install -r requirements-lint.txt ++ - name: Analysing the code with ruff ++ run: | ++ echo "::add-matcher::.github/workflows/matchers/ruff.json" ++ ruff check --output-format github . ++ - name: Run isort ++ run: | ++ isort . 
--check-only +diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh +index 60a3978..122e4e1 100644 +--- a/.github/workflows/scripts/build.sh ++++ b/.github/workflows/scripts/build.sh +@@ -1,4 +1,5 @@ + #!/bin/bash ++set -eux + + python_executable=python$1 + cuda_home=/usr/local/cuda-$2 +@@ -8,14 +9,15 @@ PATH=${cuda_home}/bin:$PATH + LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH + + # Install requirements +-$python_executable -m pip install wheel packaging +-$python_executable -m pip install -r requirements-cuda.txt ++$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt + + # Limit the number of parallel jobs to avoid OOM + export MAX_JOBS=1 +-# Make sure punica is built for the release (for LoRA) +-export VLLM_INSTALL_PUNICA_KERNELS=1 + # Make sure release wheels are built for the following architectures + export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" ++export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real" ++ ++bash tools/check_repo.sh ++ + # Build + $python_executable setup.py bdist_wheel --dist-dir=dist +diff --git a/.github/workflows/scripts/cuda-install.sh b/.github/workflows/scripts/cuda-install.sh +index 312c6e8..3d0b7a1 100644 +--- a/.github/workflows/scripts/cuda-install.sh ++++ b/.github/workflows/scripts/cuda-install.sh +@@ -1,16 +1,16 @@ + #!/bin/bash + + # Replace '.' with '-' ex: 11.8 -> 11-8 +-cuda_version=$(echo $1 | tr "." "-") ++cuda_version=$(echo "$1" | tr "." "-") + # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004 +-OS=$(echo $2 | tr -d ".\-") ++OS=$(echo "$2" | tr -d ".\-") + + # Installs CUDA +-wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb ++wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb" + sudo dpkg -i cuda-keyring_1.1-1_all.deb + rm cuda-keyring_1.1-1_all.deb + sudo apt -qq update +-sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} ++sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}" + sudo apt clean + + # Test nvcc +diff --git a/.github/workflows/scripts/pytorch-install.sh b/.github/workflows/scripts/pytorch-install.sh +index dfc1851..e3cda7d 100644 +--- a/.github/workflows/scripts/pytorch-install.sh ++++ b/.github/workflows/scripts/pytorch-install.sh +@@ -6,7 +6,7 @@ cuda_version=$3 + + # Install torch + $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya +-$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./} ++$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}" + + # Print version information + $python_executable --version +diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml +new file mode 100644 +index 0000000..4b1587e +--- /dev/null ++++ b/.github/workflows/shellcheck.yml +@@ -0,0 +1,37 @@ ++name: Lint shell scripts ++on: ++ push: ++ branches: ++ - "main" ++ paths: ++ - '**/*.sh' ++ - '.github/workflows/shellcheck.yml' ++ pull_request: ++ branches: ++ - "main" ++ paths: ++ - '**/*.sh' ++ - '.github/workflows/shellcheck.yml' ++ ++env: ++ LC_ALL: en_US.UTF-8 ++ ++defaults: ++ run: ++ shell: bash ++ ++permissions: 
++ contents: read ++ ++jobs: ++ shellcheck: ++ runs-on: ubuntu-latest ++ steps: ++ - name: "Checkout" ++ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 ++ with: ++ fetch-depth: 0 ++ ++ - name: "Check shell scripts" ++ run: | ++ tools/shellcheck.sh +diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml +new file mode 100644 +index 0000000..81e7c9b +--- /dev/null ++++ b/.github/workflows/stale.yml +@@ -0,0 +1,52 @@ ++name: 'Close inactive issues and PRs' ++ ++on: ++ schedule: ++ # Daily at 1:30 AM UTC ++ - cron: '30 1 * * *' ++ ++jobs: ++ close-issues-and-pull-requests: ++ permissions: ++ issues: write ++ pull-requests: write ++ actions: write ++ runs-on: ubuntu-latest ++ steps: ++ - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 ++ with: ++ # Increasing this value ensures that changes to this workflow ++ # propagate to all issues and PRs in days rather than months ++ operations-per-run: 1000 ++ ++ exempt-draft-pr: true ++ exempt-issue-labels: 'keep-open' ++ exempt-pr-labels: 'keep-open' ++ ++ labels-to-add-when-unstale: 'unstale' ++ labels-to-remove-when-stale: 'unstale' ++ ++ days-before-issue-stale: 90 ++ days-before-issue-close: 30 ++ stale-issue-label: 'stale' ++ stale-issue-message: > ++ This issue has been automatically marked as stale because it has not ++ had any activity within 90 days. It will be automatically closed if no ++ further activity occurs within 30 days. Leave a comment if ++ you feel this issue should remain open. Thank you! ++ close-issue-message: > ++ This issue has been automatically closed due to inactivity. Please ++ feel free to reopen if you feel it is still relevant. Thank you! ++ ++ days-before-pr-stale: 90 ++ days-before-pr-close: 30 ++ stale-pr-label: 'stale' ++ stale-pr-message: > ++ This pull request has been automatically marked as stale because it ++ has not had any activity within 90 days. It will be automatically ++ closed if no further activity occurs within 30 days. Leave a comment ++ if you feel this pull request should remain open. Thank you! ++ close-pr-message: > ++ This pull request has been automatically closed due to inactivity. ++ Please feel free to reopen if you intend to continue working on it. ++ Thank you! +diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml +index 04f307b..ff441f9 100644 +--- a/.github/workflows/yapf.yml ++++ b/.github/workflows/yapf.yml +@@ -6,26 +6,33 @@ on: + push: + branches: + - main ++ paths: ++ - "**/*.py" ++ - .github/workflows/yapf.yml + pull_request: + branches: + - main ++ paths: ++ - "**/*.py" ++ - .github/workflows/yapf.yml ++ + jobs: + yapf: + runs-on: ubuntu-latest + strategy: + matrix: +- python-version: ["3.8", "3.9", "3.10", "3.11"] ++ python-version: ["3.12"] + steps: +- - uses: actions/checkout@v2 +- - name: Set up Python ${{ matrix.python-version }} +- uses: actions/setup-python@v2 +- with: +- python-version: ${{ matrix.python-version }} +- - name: Install dependencies +- run: | +- python -m pip install --upgrade pip +- pip install yapf==0.32.0 +- pip install toml==0.10.2 +- - name: Running yapf +- run: | +- yapf --diff --recursive . 
++ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 ++ - name: Set up Python ${{ matrix.python-version }} ++ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 ++ with: ++ python-version: ${{ matrix.python-version }} ++ - name: Install dependencies ++ run: | ++ python -m pip install --upgrade pip ++ pip install yapf==0.32.0 ++ pip install toml==0.10.2 ++ - name: Running yapf ++ run: | ++ yapf --diff --recursive . +diff --git a/.gitignore b/.gitignore +index e077366..89dab8f 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -1,3 +1,9 @@ ++# version file generated by setuptools-scm ++/vllm/_version.py ++ ++# vllm-flash-attn built from source ++vllm/vllm_flash_attn/ ++ + # Byte-compiled / optimized / DLL files + __pycache__/ + *.py[cod] +@@ -9,6 +15,8 @@ __pycache__/ + # Distribution / packaging + .Python + build/ ++cmake-build-*/ ++CMakeUserPresets.json + develop-eggs/ + dist/ + downloads/ +@@ -25,6 +33,7 @@ share/python-wheels/ + .installed.cfg + *.egg + MANIFEST ++/.deps/ + + # PyInstaller + # Usually these files are written by a python script from a template +@@ -70,8 +79,7 @@ instance/ + + # Sphinx documentation + docs/_build/ +-docs/source/getting_started/examples/*.rst +-!**/*.template.rst ++docs/source/getting_started/examples/ + + # PyBuilder + .pybuilder/ +@@ -84,6 +92,9 @@ target/ + profile_default/ + ipython_config.py + ++# generated files ++**/generated/** ++ + # pyenv + # For a library or package, you might want to ignore these files since the code is + # intended to run in multiple environments; otherwise, check them in: +@@ -186,4 +197,8 @@ _build/ + hip_compat.h + + # Benchmark dataset +-*.json ++benchmarks/*.json ++ ++# Linting ++actionlint ++shellcheck*/ +diff --git a/.readthedocs.yaml b/.readthedocs.yaml +index 428e199..284196b 100644 +--- a/.readthedocs.yaml ++++ b/.readthedocs.yaml +@@ -6,16 +6,16 @@ version: 2 + build: + os: ubuntu-22.04 + tools: +- python: "3.8" ++ python: "3.12" + + sphinx: +- configuration: docs/source/conf.py ++ configuration: docs/source/conf.py ++ fail_on_warning: true + + # If using Sphinx, optionally build your docs in additional formats such as PDF +-formats: +- - pdf ++formats: [] + + # Optionally declare the Python requirements required to build your docs + python: +- install: +- - requirements: docs/requirements-docs.txt ++ install: ++ - requirements: docs/requirements-docs.txt +diff --git a/.shellcheckrc b/.shellcheckrc +new file mode 100644 +index 0000000..f3b6eed +--- /dev/null ++++ b/.shellcheckrc +@@ -0,0 +1,9 @@ ++# rules currently disabled: ++# ++# SC1091 (info): Not following: was not specified as input (see shellcheck -x) ++# SC2004 (style): $/${} is unnecessary on arithmetic variables. ++# SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects. ++# SC2155 (warning): Declare and assign separately to avoid masking return values. ++# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails. ++# ++disable=SC1091,SC2004,SC2129,SC2155,SC2164 +diff --git a/CMakeLists.txt b/CMakeLists.txt +index f817f33..f4b9c3e 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -1,25 +1,43 @@ +-cmake_minimum_required(VERSION 3.21) ++cmake_minimum_required(VERSION 3.26) + ++# When building directly using CMake, make sure you run the install step ++# (it places the .so files in the correct location). ++# ++# Example: ++# mkdir build && cd build ++# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. .. ++# cmake --build . 
--target install ++# ++# If you want to only build one target, make sure to install it manually: ++# cmake --build . --target _C ++# cmake --install . --component _C + project(vllm_extensions LANGUAGES CXX) + +-option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda") ++# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) ++set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") + + message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") + message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") + + include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) + ++# Suppress potential warnings about unused manually-specified variables ++set(ignoreMe "${VLLM_PYTHON_PATH}") ++ ++# Prevent installation of dependencies (cutlass) by default. ++install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) ++ + # + # Supported python versions. These versions will be searched in order, the + # first match will be selected. These should be kept in sync with setup.py. + # +-set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") ++set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") + + # Supported NVIDIA architectures. +-set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") ++set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0") + + # Supported AMD GPU architectures. +-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100") ++set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101") + + # + # Supported/expected torch versions for CUDA/ROCm. +@@ -31,9 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 + # requirements.txt files and should be kept consistent. The ROCm torch + # versions are derived from Dockerfile.rocm + # +-set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0") +-set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") +-set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") ++set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1") ++set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1") + + # + # Try to find python package with an executable that exactly matches +@@ -66,19 +83,6 @@ endif() + # + find_package(Torch REQUIRED) + +-# +-# Normally `torch.utils.cpp_extension.CUDAExtension` would add +-# `libtorch_python.so` for linking against an extension. Torch's cmake +-# configuration does not include this library (presumably since the cmake +-# config is used for standalone C++ binaries that link against torch). +-# The `libtorch_python.so` library defines some of the glue code between +-# torch/python via pybind and is required by VLLM extensions for this +-# reason. So, add it by manually with `find_library` using torch's +-# installed library path. +-# +-find_library(torch_python_LIBRARY torch_python PATHS +- "${TORCH_INSTALL_PREFIX}/lib") +- + # + # Forward the non-CUDA device extensions to external CMake scripts. + # +@@ -87,7 +91,7 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND + if (VLLM_TARGET_DEVICE STREQUAL "cpu") + include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) + else() +- message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}") ++ return() + endif() + return() + endif() +@@ -111,31 +115,42 @@ elseif(HIP_FOUND) + # .hip extension automatically, HIP must be enabled explicitly. 
+ enable_language(HIP) + +- # ROCm 5.x +- if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND +- NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X}) +- message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} " +- "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.") +- endif() +- +- # ROCm 6.x +- if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND +- NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X}) +- message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} " +- "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.") ++ # ROCm 5.X and 6.X ++ if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND ++ NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM}) ++ message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} " ++ "expected for ROCm build, saw ${Torch_VERSION} instead.") + endif() + else() + message(FATAL_ERROR "Can't find CUDA or HIP installation.") + endif() + +-# +-# Override the GPU architectures detected by cmake/torch and filter them by +-# the supported versions for the current language. +-# The final set of arches is stored in `VLLM_GPU_ARCHES`. +-# +-override_gpu_arches(VLLM_GPU_ARCHES +- ${VLLM_GPU_LANG} +- "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") ++ ++if(VLLM_GPU_LANG STREQUAL "CUDA") ++ # ++ # For cuda we want to be able to control which architectures we compile for on ++ # a per-file basis in order to cut down on compile time. So here we extract ++ # the set of architectures we want to compile for and remove the from the ++ # CMAKE_CUDA_FLAGS so that they are not applied globally. ++ # ++ clear_cuda_arches(CUDA_ARCH_FLAGS) ++ extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}") ++ message(STATUS "CUDA target architectures: ${CUDA_ARCHS}") ++ # Filter the target architectures by the supported supported archs ++ # since for some files we will build for all CUDA_ARCHS. ++ cuda_archs_loose_intersection(CUDA_ARCHS ++ "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}") ++ message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}") ++else() ++ # ++ # For other GPU targets override the GPU architectures detected by cmake/torch ++ # and filter them by the supported versions for the current language. ++ # The final set of arches is stored in `VLLM_GPU_ARCHES`. ++ # ++ override_gpu_arches(VLLM_GPU_ARCHES ++ ${VLLM_GPU_LANG} ++ "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") ++endif() + + # + # Query torch for additional GPU compilation flags for the given +@@ -151,8 +166,19 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") + list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") + endif() + ++ + # +-# Define extension targets ++# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process. ++# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache. ++# Each dependency that produces build artifacts should override its BINARY_DIR to avoid ++# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/. 
++# ++include(FetchContent) ++file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists ++message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") ++ ++# ++# Define other extension targets + # + + # +@@ -161,27 +187,243 @@ endif() + + set(VLLM_EXT_SRC + "csrc/cache_kernels.cu" +- "csrc/attention/attention_kernels.cu" ++ "csrc/attention/paged_attention_v1.cu" ++ "csrc/attention/paged_attention_v2.cu" + "csrc/pos_encoding_kernels.cu" + "csrc/activation_kernels.cu" + "csrc/layernorm_kernels.cu" +- "csrc/quantization/squeezellm/quant_cuda_kernel.cu" ++ "csrc/layernorm_quant_kernels.cu" + "csrc/quantization/gptq/q_gemm.cu" +- "csrc/quantization/fp8/fp8_cuda_kernels.cu" ++ "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" ++ "csrc/quantization/fp8/common.cu" ++ "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" ++ "csrc/quantization/gguf/gguf_kernel.cu" + "csrc/cuda_utils_kernels.cu" +- "csrc/moe_align_block_size_kernels.cu" +- "csrc/pybind.cpp") ++ "csrc/prepare_inputs/advance_step.cu" ++ "csrc/torch_bindings.cpp") + + if(VLLM_GPU_LANG STREQUAL "CUDA") ++ SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") ++ ++ # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case. ++ set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use") ++ ++ # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided ++ if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) ++ set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR}) ++ endif() ++ ++ if(VLLM_CUTLASS_SRC_DIR) ++ if(NOT IS_ABSOLUTE VLLM_CUTLASS_SRC_DIR) ++ get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE) ++ endif() ++ message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation") ++ FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR}) ++ else() ++ FetchContent_Declare( ++ cutlass ++ GIT_REPOSITORY https://github.com/nvidia/cutlass.git ++ GIT_TAG v3.6.0 ++ GIT_PROGRESS TRUE ++ ++ # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. ++ # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags. ++ # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE ++ GIT_SHALLOW TRUE ++ ) ++ endif() ++ FetchContent_MakeAvailable(cutlass) ++ + list(APPEND VLLM_EXT_SRC ++ "csrc/mamba/mamba_ssm/selective_scan_fwd.cu" ++ "csrc/mamba/causal_conv1d/causal_conv1d.cu" + "csrc/quantization/aqlm/gemm_kernels.cu" + "csrc/quantization/awq/gemm_kernels.cu" +- "csrc/quantization/marlin/marlin_cuda_kernel.cu" +- "csrc/quantization/gptq_marlin/gptq_marlin.cu" +- "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" +- "csrc/custom_all_reduce.cu") ++ "csrc/custom_all_reduce.cu" ++ "csrc/permute_cols.cu" ++ "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" ++ "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" ++ "csrc/sparse/cutlass/sparse_compressor_entry.cu" ++ "csrc/cutlass_extensions/common.cpp") ++ ++ set_gencode_flags_for_srcs( ++ SRCS "${VLLM_EXT_SRC}" ++ CUDA_ARCHS "${CUDA_ARCHS}") ++ ++ # Only build Marlin kernels if we are building for at least some compatible archs. ++ # Keep building Marlin for 9.0 as there are some group sizes and shapes that ++ # are not supported by Machete yet. 
++ cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS}) ++ if (MARLIN_ARCHS) ++ set(MARLIN_SRCS ++ "csrc/quantization/fp8/fp8_marlin.cu" ++ "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" ++ "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" ++ "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" ++ "csrc/quantization/gptq_marlin/gptq_marlin.cu" ++ "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" ++ "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") ++ set_gencode_flags_for_srcs( ++ SRCS "${MARLIN_SRCS}" ++ CUDA_ARCHS "${MARLIN_ARCHS}") ++ list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}") ++ message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") ++ else() ++ message(STATUS "Not building Marlin kernels as no compatible archs found" ++ " in CUDA target architectures") ++ endif() ++ ++ # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require ++ # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now). ++ cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}") ++ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) ++ set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") ++ set_gencode_flags_for_srcs( ++ SRCS "${SRCS}" ++ CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") ++ list(APPEND VLLM_EXT_SRC "${SRCS}") ++ list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1") ++ message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") ++ else() ++ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) ++ message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is " ++ "not >= 12.0, we recommend upgrading to CUDA 12.0 or " ++ "later if you intend on running FP8 quantized models on " ++ "Hopper.") ++ else() ++ message(STATUS "Not building scaled_mm_c3x as no compatible archs found " ++ "in CUDA target architectures") ++ endif() ++ ++ # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't ++ # build any 3x kernels ++ set(SCALED_MM_3X_ARCHS) ++ endif() ++ ++ # ++ # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) ++ # kernels for the remaining archs that are not already built for 3x. ++ cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS ++ "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") ++ # subtract out the archs that are already built for 3x ++ list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) ++ if (SCALED_MM_2X_ARCHS) ++ set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu") ++ set_gencode_flags_for_srcs( ++ SRCS "${SRCS}" ++ CUDA_ARCHS "${SCALED_MM_2X_ARCHS}") ++ list(APPEND VLLM_EXT_SRC "${SRCS}") ++ list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1") ++ message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}") ++ else() ++ if (SCALED_MM_3X_ARCHS) ++ message(STATUS "Not building scaled_mm_c2x as all archs are already built" ++ " for and covered by scaled_mm_c3x") ++ else() ++ message(STATUS "Not building scaled_mm_c2x as no compatible archs found " ++ "in CUDA target architectures") ++ endif() ++ endif() ++ ++ # ++ # 2:4 Sparse Kernels ++ ++ # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor ++ # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now). 
++ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) ++ set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu" ++ "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") ++ set_gencode_flags_for_srcs( ++ SRCS "${SRCS}" ++ CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") ++ list(APPEND VLLM_EXT_SRC "${SRCS}") ++ list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1") ++ message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") ++ else() ++ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) ++ message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is " ++ "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " ++ "if you intend on running FP8 sparse quantized models on Hopper.") ++ else() ++ message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found " ++ "in CUDA target architectures") ++ endif() ++ endif() ++ ++ ++ # ++ # Machete kernels ++ ++ # The machete kernels only work on hopper and require CUDA 12.0 or later. ++ # Only build Machete kernels if we are building for something compatible with sm90a ++ cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}") ++ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) ++ # ++ # For the Machete kernels we automatically generate sources for various ++ # preselected input type pairs and schedules. ++ # Generate sources: ++ set(MACHETE_GEN_SCRIPT ++ ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py) ++ file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH) ++ ++ message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}") ++ message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}") ++ ++ if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH} ++ OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH}) ++ execute_process( ++ COMMAND ${CMAKE_COMMAND} -E env ++ PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH ++ ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT} ++ RESULT_VARIABLE machete_generation_result ++ OUTPUT_VARIABLE machete_generation_output ++ OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log ++ ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log ++ ) ++ ++ if (NOT machete_generation_result EQUAL 0) ++ message(FATAL_ERROR "Machete generation failed." 
++ " Result: \"${machete_generation_result}\"" ++ "\nCheck the log for details: " ++ "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") ++ else() ++ set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} ++ CACHE STRING "Last run machete generate script hash" FORCE) ++ message(STATUS "Machete generation completed successfully.") ++ endif() ++ else() ++ message(STATUS "Machete generation script has not changed, skipping generation.") ++ endif() ++ ++ # Add machete generated sources ++ file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu") ++ list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES}) ++ ++ # forward compatible ++ set_gencode_flags_for_srcs( ++ SRCS "${MACHETE_GEN_SOURCES}" ++ CUDA_ARCHS "${MACHETE_ARCHS}") ++ ++ list(APPEND VLLM_EXT_SRC ++ csrc/quantization/machete/machete_pytorch.cu) ++ ++ message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}") ++ else() ++ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 ++ AND MACHETE_ARCHS) ++ message(STATUS "Not building Machete kernels as CUDA Compiler version is " ++ "not >= 12.0, we recommend upgrading to CUDA 12.0 or " ++ "later if you intend on running w4a16 quantized models on " ++ "Hopper.") ++ else() ++ message(STATUS "Not building Machete kernels as no compatible archs " ++ "found in CUDA target architectures") ++ endif() ++ endif() ++# if CUDA endif + endif() + ++message(STATUS "Enabling C extension.") + define_gpu_extension_target( + _C + DESTINATION vllm +@@ -189,16 +431,55 @@ define_gpu_extension_target( + SOURCES ${VLLM_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} ++ INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} ++ USE_SABI 3 + WITH_SOABI) + ++# If CUTLASS is compiled on NVCC >= 12.5, it by default uses ++# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the ++# driver API. This causes problems when linking with earlier versions of CUDA. ++# Setting this variable sidesteps the issue by calling the driver directly. 
++target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) ++ + # + # _moe_C extension + # + + set(VLLM_MOE_EXT_SRC +- "csrc/moe/moe_ops.cpp" ++ "csrc/moe/torch_bindings.cpp" ++ "csrc/moe/moe_align_sum_kernels.cu" + "csrc/moe/topk_softmax_kernels.cu") + ++set_gencode_flags_for_srcs( ++ SRCS "${VLLM_MOE_EXT_SRC}" ++ CUDA_ARCHS "${CUDA_ARCHS}") ++ ++if(VLLM_GPU_LANG STREQUAL "CUDA") ++ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") ++ if (MARLIN_MOE_ARCHS) ++ set(MARLIN_MOE_SRC ++ "csrc/moe/marlin_kernels/marlin_moe_kernel.h" ++ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h" ++ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu" ++ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h" ++ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu" ++ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h" ++ "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu" ++ "csrc/moe/marlin_moe_ops.cu") ++ ++ set_gencode_flags_for_srcs( ++ SRCS "${MARLIN_MOE_SRC}" ++ CUDA_ARCHS "${MARLIN_MOE_ARCHS}") ++ ++ list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}") ++ message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") ++ else() ++ message(STATUS "Not building Marlin MOE kernels as no compatible archs found" ++ " in CUDA target architectures") ++ endif() ++endif() ++ ++message(STATUS "Enabling moe extension.") + define_gpu_extension_target( + _moe_C + DESTINATION vllm +@@ -206,89 +487,101 @@ define_gpu_extension_target( + SOURCES ${VLLM_MOE_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} ++ USE_SABI 3 + WITH_SOABI) + +-# +-# _punica_C extension +-# +- +-set(VLLM_PUNICA_EXT_SRC +- "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu" +- "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu" +- "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu" +- "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu" +- "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu" +- "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu" +- "csrc/punica/punica_ops.cc") +- +-# +-# Copy GPU compilation flags+update for punica +-# +-set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS}) +-list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS +- "-D__CUDA_NO_HALF_OPERATORS__" +- "-D__CUDA_NO_HALF_CONVERSIONS__" +- "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" +- "-D__CUDA_NO_HALF2_OPERATORS__") +- +-# +-# Filter out CUDA architectures < 8.0 for punica. +-# +-if (${VLLM_GPU_LANG} STREQUAL "CUDA") +- set(VLLM_PUNICA_GPU_ARCHES) +- foreach(ARCH ${VLLM_GPU_ARCHES}) +- string_to_ver(CODE_VER ${ARCH}) +- if (CODE_VER GREATER_EQUAL 8.0) +- list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH}) +- endif() +- endforeach() +- message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}") +-endif() ++if(VLLM_GPU_LANG STREQUAL "HIP") ++ # ++ # _rocm_C extension ++ # ++ set(VLLM_ROCM_EXT_SRC ++ "csrc/rocm/torch_bindings.cpp" ++ "csrc/rocm/attention.cu") + +-if (VLLM_PUNICA_GPU_ARCHES) + define_gpu_extension_target( +- _punica_C ++ _rocm_C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} +- SOURCES ${VLLM_PUNICA_EXT_SRC} +- COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS} +- ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES} ++ SOURCES ${VLLM_ROCM_EXT_SRC} ++ COMPILE_FLAGS ${VLLM_GPU_FLAGS} ++ ARCHITECTURES ${VLLM_GPU_ARCHES} ++ USE_SABI 3 + WITH_SOABI) +-else() +- message(WARNING "Unable to create _punica_C target because none of the " +- "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. 
>= 8.0") ++endif() ++ ++# vllm-flash-attn currently only supported on CUDA ++if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda") ++ return() ++endif () ++ ++# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target ++# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the ++# arches in the CUDA case (and instead set the gencodes on a per file basis) ++# we need to manually set VLLM_GPU_ARCHES here. ++if(VLLM_GPU_LANG STREQUAL "CUDA") ++ foreach(_ARCH ${CUDA_ARCHS}) ++ string(REPLACE "." "" _ARCH "${_ARCH}") ++ list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real") ++ endforeach() + endif() + + # +-# Add the `default` target which detects which extensions should be +-# built based on platform/architecture. This is the same logic that +-# setup.py uses to select which extensions should be built and should +-# be kept in sync. +-# +-# The `default` target makes direct use of cmake easier since knowledge +-# of which extensions are supported has been factored in, e.g. ++# Build vLLM flash attention from source + # +-# mkdir build && cd build +-# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm .. +-# cmake --build . --target default ++# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM. ++# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs. ++# They should be identical but if they aren't, this is a massive footgun. + # +-add_custom_target(default) ++# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place. ++# To only install vllm-flash-attn, use --component vllm_flash_attn_c. ++# If no component is specified, vllm-flash-attn is still installed. + +-if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") +- message(STATUS "Enabling C extension.") +- add_dependencies(default _C) ++# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading. ++# This is to enable local development of vllm-flash-attn within vLLM. ++# It can be set as an environment variable or passed as a cmake argument. ++# The environment variable takes precedence. ++if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR}) ++ set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR}) + endif() + +-if(VLLM_GPU_LANG STREQUAL "CUDA") +- message(STATUS "Enabling moe extension.") +- add_dependencies(default _moe_C) +- +- # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or +- # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and +- # there are supported target arches. +- if (VLLM_PUNICA_GPU_ARCHES AND +- (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS)) +- message(STATUS "Enabling punica extension.") +- add_dependencies(default _punica_C) +- endif() ++if(VLLM_FLASH_ATTN_SRC_DIR) ++ FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR}) ++else() ++ FetchContent_Declare( ++ vllm-flash-attn ++ GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git ++ GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c ++ GIT_PROGRESS TRUE ++ # Don't share the vllm-flash-attn build between build types ++ BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn ++ ) + endif() ++ ++# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization. 
++set(VLLM_PARENT_BUILD ON) ++ ++# Ensure the vllm/vllm_flash_attn directory exists before installation ++install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c) ++ ++# Make sure vllm-flash-attn install rules are nested under vllm/ ++install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c) ++install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c) ++install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c) ++ ++# Fetch the vllm-flash-attn library ++FetchContent_MakeAvailable(vllm-flash-attn) ++message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") ++ ++# Restore the install prefix ++install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c) ++install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c) ++ ++# Copy over the vllm-flash-attn python files ++install( ++ DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ ++ DESTINATION vllm/vllm_flash_attn ++ COMPONENT vllm_flash_attn_c ++ FILES_MATCHING PATTERN "*.py" ++) ++ ++# Nothing after vllm-flash-attn, see comment about macros above +diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md +new file mode 100644 +index 0000000..f801b5f +--- /dev/null ++++ b/CODE_OF_CONDUCT.md +@@ -0,0 +1,128 @@ ++ ++# vLLM Code of Conduct ++ ++## Our Pledge ++ ++We as members, contributors, and leaders pledge to make participation in our ++community a harassment-free experience for everyone, regardless of age, body ++size, visible or invisible disability, ethnicity, sex characteristics, gender ++identity and expression, level of experience, education, socioeconomic status, ++nationality, personal appearance, race, caste, color, religion, or sexual ++identity and orientation. ++ ++We pledge to act and interact in ways that contribute to an open, welcoming, ++diverse, inclusive, and healthy community. ++ ++## Our Standards ++ ++Examples of behavior that contributes to a positive environment for our ++community include: ++ ++* Demonstrating empathy and kindness toward other people ++* Being respectful of differing opinions, viewpoints, and experiences ++* Giving and gracefully accepting constructive feedback ++* Accepting responsibility and apologizing to those affected by our mistakes, ++ and learning from the experience ++* Focusing on what is best not just for us as individuals, but for the overall ++ community ++ ++Examples of unacceptable behavior include: ++ ++* The use of sexualized language or imagery, and sexual attention or advances of ++ any kind ++* Trolling, insulting or derogatory comments, and personal or political attacks ++* Public or private harassment ++* Publishing others' private information, such as a physical or email address, ++ without their explicit permission ++* Other conduct which could reasonably be considered inappropriate in a ++ professional setting ++ ++## Enforcement Responsibilities ++ ++Community leaders are responsible for clarifying and enforcing our standards of ++acceptable behavior and will take appropriate and fair corrective action in ++response to any behavior that they deem inappropriate, threatening, offensive, ++or harmful. 
++ ++Community leaders have the right and responsibility to remove, edit, or reject ++comments, commits, code, wiki edits, issues, and other contributions that are ++not aligned to this Code of Conduct, and will communicate reasons for moderation ++decisions when appropriate. ++ ++## Scope ++ ++This Code of Conduct applies within all community spaces, and also applies when ++an individual is officially representing the community in public spaces. ++Examples of representing our community include using an official email address, ++posting via an official social media account, or acting as an appointed ++representative at an online or offline/IRL event. ++ ++## Enforcement ++ ++Instances of abusive, harassing, or otherwise unacceptable behavior may be ++reported to the community leaders responsible for enforcement in the #code-of-conduct ++channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g). ++All complaints will be reviewed and investigated promptly and fairly. ++ ++All community leaders are obligated to respect the privacy and security of the ++reporter of any incident. ++ ++## Enforcement Guidelines ++ ++Community leaders will follow these Community Impact Guidelines in determining ++the consequences for any action they deem in violation of this Code of Conduct: ++ ++### 1. Correction ++ ++**Community Impact**: Use of inappropriate language or other behavior deemed ++unprofessional or unwelcome in the community. ++ ++**Consequence**: A private, written warning from community leaders, providing ++clarity around the nature of the violation and an explanation of why the ++behavior was inappropriate. A public apology may be requested. ++ ++### 2. Warning ++ ++**Community Impact**: A violation through a single incident or series of ++actions. ++ ++**Consequence**: A warning with consequences for continued behavior. No ++interaction with the people involved, including unsolicited interaction with ++those enforcing the Code of Conduct, for a specified period of time. This ++includes avoiding interactions in community spaces as well as external channels ++like social media. Violating these terms may lead to a temporary or permanent ++ban. ++ ++### 3. Temporary Ban ++ ++**Community Impact**: A serious violation of community standards, including ++sustained inappropriate behavior. ++ ++**Consequence**: A temporary ban from any sort of interaction or public ++communication with the community for a specified period of time. No public or ++private interaction with the people involved, including unsolicited interaction ++with those enforcing the Code of Conduct, is allowed during this period. ++Violating these terms may lead to a permanent ban. ++ ++### 4. Permanent Ban ++ ++**Community Impact**: Demonstrating a pattern of violation of community ++standards, including sustained inappropriate behavior, harassment of an ++individual, or aggression toward or disparagement of classes of individuals. ++ ++**Consequence**: A permanent ban from any sort of public interaction within the ++community. ++ ++## Attribution ++ ++This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), ++version 2.1, available at ++[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). ++ ++Community Impact Guidelines were inspired by ++[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). 
++ ++For answers to common questions about this code of conduct, see the ++[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at ++[Contributor Covenant translations](https://www.contributor-covenant.org/translations). ++ +diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md +index 81a8db2..6d46a6d 100644 +--- a/CONTRIBUTING.md ++++ b/CONTRIBUTING.md +@@ -1,56 +1,3 @@ + # Contributing to vLLM + +-Thank you for your interest in contributing to vLLM! +-Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. +-There are several ways you can contribute to the project: +- +-- Identify and report any issues or bugs. +-- Request or add a new model. +-- Suggest or implement new features. +- +-However, remember that contributions aren't just about code. +-We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. +- +-Finally, one of the most impactful ways to support us is by raising awareness about vLLM. +-Talk about it in your blog posts, highlighting how it's driving your incredible projects. +-Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. +- +- +-## Setup for development +- +-### Build from source +- +-```bash +-pip install -e . # This may take several minutes. +-``` +- +-### Testing +- +-```bash +-pip install -r requirements-dev.txt +- +-# linting and formatting +-bash format.sh +-# Static type checking +-mypy +-# Unit tests +-pytest tests/ +-``` +-**Note:** Currently, the repository does not pass the mypy tests. +- +- +-## Contributing Guidelines +- +-### Issue Reporting +- +-If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. +-If not, please file a new issue, providing as much relevant information as possible. +- +-### Pull Requests & Code Reviews +- +-Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution. +- +-### Thank You +- +-Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. +-Your contributions make vLLM a great tool for everyone! ++You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). +diff --git a/DCO b/DCO +new file mode 100644 +index 0000000..49b8cb0 +--- /dev/null ++++ b/DCO +@@ -0,0 +1,34 @@ ++Developer Certificate of Origin ++Version 1.1 ++ ++Copyright (C) 2004, 2006 The Linux Foundation and its contributors. ++ ++Everyone is permitted to copy and distribute verbatim copies of this ++license document, but changing it is not allowed. 
++ ++ ++Developer's Certificate of Origin 1.1 ++ ++By making a contribution to this project, I certify that: ++ ++(a) The contribution was created in whole or in part by me and I ++ have the right to submit it under the open source license ++ indicated in the file; or ++ ++(b) The contribution is based upon previous work that, to the best ++ of my knowledge, is covered under an appropriate open source ++ license and I have the right under that license to submit that ++ work with modifications, whether created in whole or in part ++ by me, under the same open source license (unless I am ++ permitted to submit under a different license), as indicated ++ in the file; or ++ ++(c) The contribution was provided directly to me by some other ++ person who certified (a), (b) or (c) and I have not modified ++ it. ++ ++(d) I understand and agree that this project and the contribution ++ are public and that a record of the contribution (including all ++ personal information I submit with it, including my sign-off) is ++ maintained indefinitely and may be redistributed consistent with ++ this project or the open source license(s) involved. +diff --git a/Dockerfile b/Dockerfile +index 90be3a3..4542bc9 100644 +--- a/Dockerfile ++++ b/Dockerfile +@@ -2,34 +2,63 @@ + # to run the OpenAI compatible server. + + # Please update any changes made here to +-# docs/source/dev/dockerfile/dockerfile.rst and +-# docs/source/assets/dev/dockerfile-stages-dependency.png ++# docs/source/contributing/dockerfile/dockerfile.md and ++# docs/source/assets/contributing/dockerfile-stages-dependency.png + ++ARG CUDA_VERSION=12.4.1 + #################### BASE BUILD IMAGE #################### + # prepare basic build environment +-FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev +- +-RUN apt-get update -y \ +- && apt-get install -y python3-pip git ++FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base ++ARG CUDA_VERSION=12.4.1 ++ARG PYTHON_VERSION=3.12 ++ARG TARGETPLATFORM ++ENV DEBIAN_FRONTEND=noninteractive ++ ++# Install Python and other dependencies ++RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ++ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ ++ && apt-get update -y \ ++ && apt-get install -y ccache software-properties-common git curl sudo \ ++ && add-apt-repository ppa:deadsnakes/ppa \ ++ && apt-get update -y \ ++ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ ++ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ ++ && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ ++ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ ++ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ ++ && python3 --version && python3 -m pip --version ++ ++# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 ++# as it was causing spam when compiling the CUTLASS kernels ++RUN apt-get install -y gcc-10 g++-10 ++RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 ++RUN <> /etc/environment ++ ++# Install Python and other dependencies ++RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ++ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ ++ && apt-get update -y \ ++ && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ ++ && apt-get 
install -y ffmpeg libsm6 libxext6 libgl1 \ ++ && add-apt-repository ppa:deadsnakes/ppa \ ++ && apt-get update -y \ ++ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ ++ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ ++ && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ ++ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ ++ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ ++ && python3 --version && python3 -m pip --version + + # Workaround for https://github.com/openai/triton/issues/2507 and + # https://github.com/pytorch/pytorch/issues/107960 -- hopefully + # this won't be needed for future versions of this docker image + # or future versions of triton. +-RUN ldconfig /usr/local/cuda-12.4/compat/ ++RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ ++ ++# arm64 (GH200) build follows the practice of "use existing pytorch" build, ++# we need to install torch and torchvision from the nightly builds first, ++# pytorch will not appear as a vLLM dependency in all of the following steps ++# after this step ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ ++ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ ++ fi + +-# install vllm wheel first, so that torch etc will be installed ++# Install vllm wheel first, so that torch etc will be installed. + RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ + --mount=type=cache,target=/root/.cache/pip \ +- pip install dist/*.whl --verbose ++ python3 -m pip install dist/*.whl --verbose + +-RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ +- --mount=type=cache,target=/root/.cache/pip \ +- pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir ++RUN --mount=type=cache,target=/root/.cache/pip \ ++. /etc/environment && \ ++if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ ++ python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ ++fi ++COPY examples examples + #################### vLLM installation IMAGE #################### + +- + #################### TEST IMAGE #################### + # image to run unit testing suite + # note that this uses vllm installed by `pip` +@@ -138,7 +211,19 @@ ADD . 
/vllm-workspace/ + + # install development dependencies (for testing) + RUN --mount=type=cache,target=/root/.cache/pip \ +- pip install -r requirements-dev.txt ++ python3 -m pip install -r requirements-dev.txt ++ ++# install development dependencies (for testing) ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ python3 -m pip install -e tests/vllm_test_utils ++ ++# enable fast downloads from hf (for testing) ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ python3 -m pip install hf_transfer ++ENV HF_HUB_ENABLE_HF_TRANSFER 1 ++ ++# Copy in the v1 package for testing (it isn't distributed yet) ++COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 + + # doc requires source code + # we hide them inside `test_docs/` , so that this source code +@@ -146,18 +231,30 @@ RUN --mount=type=cache,target=/root/.cache/pip \ + RUN mkdir test_docs + RUN mv docs test_docs/ + RUN mv vllm test_docs/ +- + #################### TEST IMAGE #################### + + #################### OPENAI API SERVER #################### +-# openai api server alternative +-FROM vllm-base AS vllm-openai ++# base openai image with additional requirements, for any subsequent openai-style images ++FROM vllm-base AS vllm-openai-base + + # install additional dependencies for openai api server + RUN --mount=type=cache,target=/root/.cache/pip \ +- pip install accelerate hf_transfer modelscope ++ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ ++ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ ++ else \ ++ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ ++ fi + + ENV VLLM_USAGE_SOURCE production-docker-image + ++# define sagemaker first, so it is not default from `docker build` ++FROM vllm-openai-base AS vllm-sagemaker ++ ++COPY examples/online_serving/sagemaker-entrypoint.sh . ++RUN chmod +x sagemaker-entrypoint.sh ++ENTRYPOINT ["./sagemaker-entrypoint.sh"] ++ ++FROM vllm-openai-base AS vllm-openai ++ + ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] + #################### OPENAI API SERVER #################### +diff --git a/Dockerfile.arm b/Dockerfile.arm +new file mode 100644 +index 0000000..093ee22 +--- /dev/null ++++ b/Dockerfile.arm +@@ -0,0 +1,62 @@ ++# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform. ++ ++FROM ubuntu:22.04 AS cpu-test-arm ++ ++ENV CCACHE_DIR=/root/.cache/ccache ++ ++ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache ++ ++RUN --mount=type=cache,target=/var/cache/apt \ ++ apt-get update -y \ ++ && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ ++ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ ++ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ++ ++# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects. 
++RUN --mount=type=cache,target=/root/.cache/pip \ ++ pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores ++ ++# Set LD_PRELOAD for tcmalloc on ARM ++ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4" ++ ++RUN echo 'ulimit -c 0' >> ~/.bashrc ++ ++WORKDIR /workspace ++ ++ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ++ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ ++ pip install --upgrade pip && \ ++ pip install -r requirements-build.txt ++ ++FROM cpu-test-arm AS build ++ ++WORKDIR /workspace/vllm ++ ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ ++ --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ ++ pip install -v -r requirements-cpu.txt ++ ++COPY . . ++ARG GIT_REPO_CHECK=0 ++RUN --mount=type=bind,source=.git,target=.git \ ++ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi ++ ++# Disabling AVX512 specific optimizations for ARM ++ARG VLLM_CPU_DISABLE_AVX512="true" ++ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} ++ ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ --mount=type=cache,target=/root/.cache/ccache \ ++ --mount=type=bind,source=.git,target=.git \ ++ VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ ++ pip install dist/*.whl && \ ++ rm -rf dist ++ ++WORKDIR /workspace/ ++ ++RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks ++ ++ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] +\ No newline at end of file +diff --git a/Dockerfile.cpu b/Dockerfile.cpu +index 4251fdd..f163edc 100644 +--- a/Dockerfile.cpu ++++ b/Dockerfile.cpu +@@ -1,20 +1,69 @@ + # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. + +-FROM ubuntu:22.04 ++FROM ubuntu:22.04 AS cpu-test-1 + +-RUN apt-get update -y \ +- && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \ ++ENV CCACHE_DIR=/root/.cache/ccache ++ ++ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache ++ ++RUN --mount=type=cache,target=/var/cache/apt \ ++ apt-get update -y \ ++ && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ ++ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + +-RUN pip install --upgrade pip \ +- && pip install wheel packaging ninja setuptools>=49.4.0 numpy ++# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html ++# intel-openmp provides additional performance improvement vs. openmp ++# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects. 
++RUN --mount=type=cache,target=/root/.cache/pip \ ++ pip install intel-openmp==2025.0.1 ++ ++ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so" ++ ++RUN echo 'ulimit -c 0' >> ~/.bashrc ++ ++RUN pip install intel_extension_for_pytorch==2.5.0 ++ ++WORKDIR /workspace + +-COPY ./ /workspace/vllm ++COPY requirements-build.txt requirements-build.txt ++ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ++ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ pip install --upgrade pip && \ ++ pip install -r requirements-build.txt ++ ++FROM cpu-test-1 AS build + + WORKDIR /workspace/vllm + +-RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu ++COPY requirements-common.txt requirements-common.txt ++COPY requirements-cpu.txt requirements-cpu.txt ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ pip install -v -r requirements-cpu.txt ++ ++COPY . . ++ARG GIT_REPO_CHECK=0 ++RUN --mount=type=bind,source=.git,target=.git \ ++ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi ++ ++# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... ++ARG VLLM_CPU_DISABLE_AVX512 ++ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} ++ ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ --mount=type=cache,target=/root/.cache/ccache \ ++ --mount=type=bind,source=.git,target=.git \ ++ VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ ++ pip install dist/*.whl && \ ++ rm -rf dist ++ ++WORKDIR /workspace/ ++ ++RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + +-RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install ++# install development dependencies (for testing) ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ pip install -e tests/vllm_test_utils + +-CMD ["/bin/bash"] ++ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] +diff --git a/Dockerfile.hpu b/Dockerfile.hpu +new file mode 100644 +index 0000000..87e0c1a +--- /dev/null ++++ b/Dockerfile.hpu +@@ -0,0 +1,21 @@ ++FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ++ ++COPY ./ /workspace/vllm ++ ++WORKDIR /workspace/vllm ++ ++RUN pip install -v -r requirements-hpu.txt ++ ++ENV no_proxy=localhost,127.0.0.1 ++ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true ++ ++RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install ++ ++# install development dependencies (for testing) ++RUN python3 -m pip install -e tests/vllm_test_utils ++ ++WORKDIR /workspace/ ++ ++RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks ++ ++ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] +diff --git a/Dockerfile.neuron b/Dockerfile.neuron +index fe42b4e..e9cb828 100644 +--- a/Dockerfile.neuron ++++ b/Dockerfile.neuron +@@ -1,36 +1,49 @@ + # default base image +-ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04" ++# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx ++ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04" + + FROM $BASE_IMAGE + + RUN echo "Base image is $BASE_IMAGE" + + # Install some basic utilities +-RUN apt-get update && apt-get install python3 python3-pip -y ++RUN apt-get update && \ ++ apt-get install -y \ ++ git \ ++ python3 \ ++ python3-pip \ ++ ffmpeg 
libsm6 libxext6 libgl1 + + ### Mount Point ### +-# When launching the container, mount the code directory to /app +-ARG APP_MOUNT=/app ++# When launching the container, mount the code directory to /workspace ++ARG APP_MOUNT=/workspace + VOLUME [ ${APP_MOUNT} ] +-WORKDIR ${APP_MOUNT} ++WORKDIR ${APP_MOUNT}/vllm + + RUN python3 -m pip install --upgrade pip + RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas +-RUN python3 -m pip install sentencepiece transformers==4.36.2 -U ++RUN python3 -m pip install sentencepiece transformers==4.45.2 -U + RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U +-RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U ++RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U ++RUN python3 -m pip install pytest + +-COPY ./vllm /app/vllm/vllm +-COPY ./setup.py /app/vllm/setup.py +-COPY ./requirements-common.txt /app/vllm/requirements-common.txt +-COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt ++COPY . . ++ARG GIT_REPO_CHECK=0 ++RUN --mount=type=bind,source=.git,target=.git \ ++ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +-RUN cd /app/vllm \ +- && python3 -m pip install -U -r requirements-neuron.txt ++RUN python3 -m pip install -U \ ++ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ ++ -r requirements-neuron.txt + +-ENV VLLM_BUILD_WITH_NEURON 1 +-RUN cd /app/vllm \ +- && pip install -e . \ +- && cd .. ++ENV VLLM_TARGET_DEVICE neuron ++RUN --mount=type=bind,source=.git,target=.git \ ++ pip install --no-build-isolation -v -e . ++ ++# install development dependencies (for testing) ++RUN python3 -m pip install -e tests/vllm_test_utils ++ ++# overwrite entrypoint to run bash script ++RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py + + CMD ["/bin/bash"] +diff --git a/Dockerfile.openvino b/Dockerfile.openvino +new file mode 100644 +index 0000000..32bcbfa +--- /dev/null ++++ b/Dockerfile.openvino +@@ -0,0 +1,29 @@ ++# The vLLM Dockerfile is used to construct vLLM image that can be directly used ++# to run the OpenAI compatible server. ++ ++FROM ubuntu:22.04 AS dev ++ ++RUN apt-get update -y && \ ++ apt-get install -y \ ++ git python3-pip \ ++ ffmpeg libsm6 libxext6 libgl1 ++WORKDIR /workspace ++ ++COPY . . 
++ARG GIT_REPO_CHECK=0 ++RUN --mount=type=bind,source=.git,target=.git \ ++ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi ++ ++RUN python3 -m pip install -U pip ++# install build requirements ++RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt ++# build vLLM with OpenVINO backend ++RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace ++ ++COPY examples/ /workspace/examples ++COPY benchmarks/ /workspace/benchmarks ++ ++# install development dependencies (for testing) ++RUN python3 -m pip install -e tests/vllm_test_utils ++ ++CMD ["/bin/bash"] +diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le +new file mode 100644 +index 0000000..d3cd1c7 +--- /dev/null ++++ b/Dockerfile.ppc64le +@@ -0,0 +1,38 @@ ++FROM mambaorg/micromamba ++ARG MAMBA_DOCKERFILE_ACTIVATE=1 ++USER root ++ ++ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" ++ ++RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev ++ ++# Some packages in requirements-cpu are installed here ++# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba ++# Currently these may not be available for venv or pip directly ++RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes ++ ++COPY ./ /workspace/vllm ++ ++WORKDIR /workspace/vllm ++ARG GIT_REPO_CHECK=0 ++RUN --mount=type=bind,source=.git,target=.git \ ++ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi ++ ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ ++ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ ++ torch==2.3.1 \ ++ -r requirements-cpu.txt \ ++ xformers uvloop==0.20.0 ++ ++RUN --mount=type=bind,source=.git,target=.git \ ++ VLLM_TARGET_DEVICE=cpu python3 setup.py install ++ ++# install development dependencies (for testing) ++RUN python3 -m pip install -e tests/vllm_test_utils ++ ++WORKDIR /workspace/ ++ ++RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks ++ ++ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"] +diff --git a/Dockerfile.rocm b/Dockerfile.rocm +index d04bb99..e733994 100644 +--- a/Dockerfile.rocm ++++ b/Dockerfile.rocm +@@ -1,35 +1,27 @@ +-# default base image +-ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" +- +-FROM $BASE_IMAGE +- +-ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" +- +-RUN echo "Base image is $BASE_IMAGE" +- +-# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" +-# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ++# Default ROCm 6.2 base image ++ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0" + ++# Default ROCm ARCHes to build vLLM for. 
++ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" + ++# Whether to install CK-based flash-attention ++# If 0, will not install flash-attention ++ARG BUILD_FA="1" + ARG FA_GFX_ARCHS="gfx90a;gfx942" +-RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" ++ARG FA_BRANCH="3cea2fb" + +-ARG FA_BRANCH="ae7928c" +-RUN echo "FA_BRANCH is $FA_BRANCH" ++# Whether to build triton on rocm ++ARG BUILD_TRITON="1" ++ARG TRITON_BRANCH="e192dba" + +-# whether to build flash-attention +-# if 0, will not build flash attention +-# this is useful for gfx target where flash-attention is not supported +-# In that case, we need to use the python reference attention implementation in vllm +-ARG BUILD_FA="1" ++### Base image build stage ++FROM $BASE_IMAGE AS base + +-# whether to build triton on rocm +-ARG BUILD_TRITON="1" ++# Import arg(s) defined before this build stage ++ARG PYTORCH_ROCM_ARCH + + # Install some basic utilities + RUN apt-get update && apt-get install python3 python3-pip -y +- +-# Install some basic utilities + RUN apt-get update && apt-get install -y \ + curl \ + ca-certificates \ +@@ -40,68 +32,143 @@ RUN apt-get update && apt-get install -y \ + build-essential \ + wget \ + unzip \ +- nvidia-cuda-toolkit \ + tmux \ ++ ccache \ + && rm -rf /var/lib/apt/lists/* + +-### Mount Point ### +-# When launching the container, mount the code directory to /app ++# When launching the container, mount the code directory to /vllm-workspace + ARG APP_MOUNT=/vllm-workspace +-VOLUME [ ${APP_MOUNT} ] + WORKDIR ${APP_MOUNT} + + RUN python3 -m pip install --upgrade pip +-RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas ++# Remove sccache so it doesn't interfere with ccache ++# TODO: implement sccache support across components ++RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" ++ ++# Install torch == 2.6.0 on ROCm ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ ++ *"rocm-6.2"*) \ ++ python3 -m pip uninstall -y torch torchvision \ ++ && python3 -m pip install --pre \ ++ torch==2.6.0.dev20241113+rocm6.2 \ ++ 'setuptools-scm>=8' \ ++ torchvision==0.20.0.dev20241113+rocm6.2 \ ++ --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \ ++ *) ;; esac + + ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer + ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: + ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: + ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: + +-# Install ROCm flash-attention +-RUN if [ "$BUILD_FA" = "1" ]; then \ +- mkdir libs \ +- && cd libs \ +- && git clone https://github.com/ROCm/flash-attention.git \ +- && cd flash-attention \ +- && git checkout ${FA_BRANCH} \ +- && git submodule update --init \ +- && export GPU_ARCHS=${FA_GFX_ARCHS} \ +- && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \ +- patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ +- && python3 setup.py install \ +- && cd ..; \ ++ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} ++ENV CCACHE_DIR=/root/.cache/ccache ++ ++ ++### AMD-SMI build stage ++FROM base AS build_amdsmi ++# Build amdsmi wheel always ++RUN cd /opt/rocm/share/amd_smi \ ++ && python3 -m pip wheel . 
--wheel-dir=/install ++ ++ ++### Flash-Attention wheel build stage ++FROM base AS build_fa ++ARG BUILD_FA ++ARG FA_GFX_ARCHS ++ARG FA_BRANCH ++# Build ROCm flash-attention wheel if `BUILD_FA = 1` ++RUN --mount=type=cache,target=${CCACHE_DIR} \ ++ if [ "$BUILD_FA" = "1" ]; then \ ++ mkdir -p libs \ ++ && cd libs \ ++ && git clone https://github.com/ROCm/flash-attention.git \ ++ && cd flash-attention \ ++ && git checkout "${FA_BRANCH}" \ ++ && git submodule update --init \ ++ && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ ++ # Create an empty directory otherwise as later build stages expect one ++ else mkdir -p /install; \ + fi + +-# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. +-# Manually removed it so that later steps of numpy upgrade can continue +-RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ +- rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi + +-# build triton +-RUN if [ "$BUILD_TRITON" = "1" ]; then \ ++### Triton wheel build stage ++FROM base AS build_triton ++ARG BUILD_TRITON ++ARG TRITON_BRANCH ++# Build triton wheel if `BUILD_TRITON = 1` ++RUN --mount=type=cache,target=${CCACHE_DIR} \ ++ if [ "$BUILD_TRITON" = "1" ]; then \ + mkdir -p libs \ + && cd libs \ +- && pip uninstall -y triton \ +- && git clone https://github.com/ROCm/triton.git \ +- && cd triton/python \ +- && pip3 install . \ +- && cd ../..; \ ++ && python3 -m pip install ninja cmake wheel pybind11 \ ++ && git clone https://github.com/OpenAI/triton.git \ ++ && cd triton \ ++ && git checkout "${TRITON_BRANCH}" \ ++ && cd python \ ++ && python3 setup.py bdist_wheel --dist-dir=/install; \ ++ # Create an empty directory otherwise as later build stages expect one ++ else mkdir -p /install; \ + fi + +-WORKDIR /vllm-workspace ++ ++### Final vLLM build stage ++FROM base AS final ++# Import the vLLM development directory from the build context + COPY . . ++ARG GIT_REPO_CHECK=0 ++RUN --mount=type=bind,source=.git,target=.git \ ++ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +-RUN python3 -m pip install --upgrade pip numba ++RUN python3 -m pip install --upgrade pip + ++# Package upgrades for useful functionality or to avoid dependency issues + RUN --mount=type=cache,target=/root/.cache/pip \ +- pip install -U -r requirements-rocm.txt \ +- && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \ +- && python3 setup.py install \ +- && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \ +- && cd .. 
++ python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard + +-RUN python3 -m pip install --upgrade pip +-RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3 ++ ++# Workaround for ray >= 2.10.0 ++ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ++# Silences the HF Tokenizers warning ++ENV TOKENIZERS_PARALLELISM=false ++ ++RUN --mount=type=cache,target=${CCACHE_DIR} \ ++ --mount=type=bind,source=.git,target=.git \ ++ --mount=type=cache,target=/root/.cache/pip \ ++ python3 -m pip install -Ur requirements-rocm.txt \ ++ && python3 setup.py clean --all \ ++ && python3 setup.py develop ++ ++# Copy amdsmi wheel into final image ++RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \ ++ mkdir -p libs \ ++ && cp /install/*.whl libs \ ++ # Preemptively uninstall to avoid same-version no-installs ++ && python3 -m pip uninstall -y amdsmi; ++ ++# Copy triton wheel(s) into final image if they were built ++RUN --mount=type=bind,from=build_triton,src=/install,target=/install \ ++ mkdir -p libs \ ++ && if ls /install/*.whl; then \ ++ cp /install/*.whl libs \ ++ # Preemptively uninstall to avoid same-version no-installs ++ && python3 -m pip uninstall -y triton; fi ++ ++# Copy flash-attn wheel(s) into final image if they were built ++RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ ++ mkdir -p libs \ ++ && if ls /install/*.whl; then \ ++ cp /install/*.whl libs \ ++ # Preemptively uninstall to avoid same-version no-installs ++ && python3 -m pip uninstall -y flash-attn; fi ++ ++# Install wheels that were built to the final image ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ if ls libs/*.whl; then \ ++ python3 -m pip install libs/*.whl; fi ++ ++# install development dependencies (for testing) ++RUN python3 -m pip install -e tests/vllm_test_utils + + CMD ["/bin/bash"] +diff --git a/Dockerfile.tpu b/Dockerfile.tpu +new file mode 100644 +index 0000000..b617932 +--- /dev/null ++++ b/Dockerfile.tpu +@@ -0,0 +1,28 @@ ++ARG NIGHTLY_DATE="20241017" ++ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" ++ ++FROM $BASE_IMAGE ++WORKDIR /workspace/vllm ++ ++# Install some basic utilities ++RUN apt-get update && apt-get install -y \ ++ git \ ++ ffmpeg libsm6 libxext6 libgl1 ++ ++# Build vLLM. ++COPY . . 
++ARG GIT_REPO_CHECK=0 ++RUN --mount=type=bind,source=.git,target=.git \ ++ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi ++ ++ENV VLLM_TARGET_DEVICE="tpu" ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ --mount=type=bind,source=.git,target=.git \ ++ python3 -m pip install \ ++ -r requirements-tpu.txt ++RUN python3 setup.py develop ++ ++# install development dependencies (for testing) ++RUN python3 -m pip install -e tests/vllm_test_utils ++ ++CMD ["/bin/bash"] +diff --git a/Dockerfile.xpu b/Dockerfile.xpu +new file mode 100644 +index 0000000..a374f20 +--- /dev/null ++++ b/Dockerfile.xpu +@@ -0,0 +1,69 @@ ++FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base ++ ++RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ ++ echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ ++ chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ ++ wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \ ++ echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \ ++ chmod 644 /usr/share/keyrings/intel-graphics.gpg ++ ++RUN apt-get update -y && \ ++ apt-get install -y --no-install-recommends --fix-missing \ ++ curl \ ++ ffmpeg \ ++ git \ ++ libsndfile1 \ ++ libsm6 \ ++ libxext6 \ ++ libgl1 \ ++ lsb-release \ ++ numactl \ ++ python3 \ ++ python3-dev \ ++ python3-pip \ ++ # vim \ ++ wget ++ ++WORKDIR /workspace/vllm ++COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt ++COPY requirements-common.txt /workspace/vllm/requirements-common.txt ++ ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ pip install --no-cache-dir \ ++ -r requirements-xpu.txt ++ ++RUN git clone https://github.com/intel/pti-gpu && \ ++ cd pti-gpu/sdk && \ ++ git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \ ++ mkdir build && \ ++ cd build && \ ++ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \ ++ make -j && \ ++ cmake --install . --config Release --prefix "/usr/local" ++ ++ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" ++ ++COPY . . ++ARG GIT_REPO_CHECK ++RUN --mount=type=bind,source=.git,target=.git \ ++ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi ++ ++ENV VLLM_TARGET_DEVICE=xpu ++ ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ --mount=type=bind,source=.git,target=.git \ ++ python3 setup.py install ++ ++CMD ["/bin/bash"] ++ ++FROM vllm-base AS vllm-openai ++ ++# install additional dependencies for openai api server ++RUN --mount=type=cache,target=/root/.cache/pip \ ++ pip install accelerate hf_transfer 'modelscope!=1.15.0' ++ ++ENV VLLM_USAGE_SOURCE production-docker-image \ ++ TRITON_XPU_PROFILE 1 ++# install development dependencies (for testing) ++RUN python3 -m pip install -e tests/vllm_test_utils ++ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] +diff --git a/README.md b/README.md +index 524d027..67c557b 100644 +--- a/README.md ++++ b/README.md +@@ -10,21 +10,28 @@ Easy, fast, and cheap LLM serving for everyone + + +
+-| Documentation | Blog | Paper | Discord |
+-
++| Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack |
+
+ ++--- ++ ++The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Google Cloud in San Francisco! We will talk about vLLM's performant V1 architecture, Q1 roadmap, Google Cloud's innovation around vLLM: networking, Cloud Run, Vertex, and TPU! [Register Now](https://lu.ma/zep56hui) ++ ++--- ++ + *Latest News* 🔥 ++- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! ++- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). ++- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! ++- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users! ++- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). ++- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). ++- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). ++- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). + - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). +-- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). +-- [2024/01] Added ROCm 6.0 support to vLLM. +-- [2023/12] Added ROCm 5.7 support to vLLM. +-- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). +-- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there. +-- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv! ++- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! 
Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). ++- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). + - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM. +-- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command! +-- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds. + - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). + + --- +@@ -34,77 +41,89 @@ vLLM is a fast and easy-to-use library for LLM inference and serving. + vLLM is fast with: + + - State-of-the-art serving throughput +-- Efficient management of attention key and value memory with **PagedAttention** ++- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) + - Continuous batching of incoming requests + - Fast model execution with CUDA/HIP graph +-- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache +-- Optimized CUDA kernels ++- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8. ++- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. ++- Speculative decoding ++- Chunked prefill ++ ++**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script. + + vLLM is flexible and easy to use with: + + - Seamless integration with popular Hugging Face models + - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more +-- Tensor parallelism support for distributed inference ++- Tensor parallelism and pipeline parallelism support for distributed inference + - Streaming outputs + - OpenAI-compatible API server +-- Support NVIDIA GPUs and AMD GPUs +-- (Experimental) Prefix caching support +-- (Experimental) Multi-lora support +- +-vLLM seamlessly supports many Hugging Face models, including the following architectures: +- +-- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.) 
+-- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.) +-- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.) +-- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.) +-- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.) +-- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct` etc.) +-- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.) +-- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.) +-- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.) +-- GPT-2 (`gpt2`, `gpt2-xl`, etc.) +-- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.) +-- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.) +-- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.) +-- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.) +-- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.) +-- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.) +-- LLaMA, Llama 2, and Meta Llama 3 (`meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.) +-- MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.) +-- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.) +-- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.) +-- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.) +-- OLMo (`allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.) +-- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.) +-- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.) +-- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.) +-- Phi-3 (`microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, etc.) +-- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.) +-- Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.) +-- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.) +-- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.) +-- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.) +-- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.) +-- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.) +- +-Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): ++- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron. ++- Prefix caching support ++- Multi-lora support ++ ++vLLM seamlessly supports most popular open-source models on HuggingFace, including: ++- Transformer-like LLMs (e.g., Llama) ++- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) ++- Embedding Models (e.g. E5-Mistral) ++- Multi-modal LLMs (e.g., LLaVA) ++ ++Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html). ++ ++## Getting Started ++ ++Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): + + ```bash + pip install vllm + ``` + +-## Getting Started +- +-Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started. 
++Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more. + - [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html) + - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html) +-- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) ++- [List of Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) + + ## Contributing + + We welcome and value any contributions and collaborations. + Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved. + ++## Sponsors ++ ++vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! ++ ++ ++ ++Cash Donations: ++- a16z ++- Dropbox ++- Sequoia Capital ++- Skywork AI ++- ZhenFund ++ ++Compute Resources: ++- AMD ++- Anyscale ++- AWS ++- Crusoe Cloud ++- Databricks ++- DeepInfra ++- Google Cloud ++- Lambda Lab ++- Nebius ++- Novita AI ++- NVIDIA ++- Replicate ++- Roblox ++- RunPod ++- Trainy ++- UC Berkeley ++- UC San Diego ++ ++Slack Sponsor: Anyscale ++ ++We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. ++ + ## Citation + + If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180): +@@ -116,3 +135,15 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs + year={2023} + } + ``` ++ ++## Contact Us ++ ++* For technical questions and feature requests, please use Github issues or discussions. ++* For discussing with fellow users, please use Discord. ++* For coordinating contributions and development, please use Slack. ++* For security disclosures, please use Github's security advisory feature. ++* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. ++ ++## Media Kit ++ ++* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit). +diff --git a/SECURITY.md b/SECURITY.md +new file mode 100644 +index 0000000..de0032d +--- /dev/null ++++ b/SECURITY.md +@@ -0,0 +1,11 @@ ++# Security Policy ++ ++## Reporting a Vulnerability ++ ++If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. ++ ++Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/). ++ ++--- ++ ++Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. +diff --git a/benchmarks/README.md b/benchmarks/README.md +index 192d6c4..2aa4a28 100644 +--- a/benchmarks/README.md ++++ b/benchmarks/README.md +@@ -6,3 +6,14 @@ You can download the dataset by running: + ```bash + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + ``` ++ ++## Downloading the ShareGPT4V dataset ++ ++The json file refers to several image datasets (coco, llava, etc.). 
The benchmark scripts ++will ignore a datapoint if the referred image is missing. ++```bash ++wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json ++mkdir coco -p ++wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip ++unzip coco/train2017.zip -d coco/ ++``` +diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py +index f9d1675..b678490 100644 +--- a/benchmarks/backend_request_func.py ++++ b/benchmarks/backend_request_func.py +@@ -4,10 +4,13 @@ import sys + import time + import traceback + from dataclasses import dataclass, field +-from typing import List, Optional ++from typing import List, Optional, Union + + import aiohttp ++import huggingface_hub.constants + from tqdm.asyncio import tqdm ++from transformers import (AutoTokenizer, PreTrainedTokenizer, ++ PreTrainedTokenizerFast) + + AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + +@@ -20,7 +23,10 @@ class RequestFuncInput: + output_len: int + model: str + best_of: int = 1 +- use_beam_search: bool = False ++ logprobs: Optional[int] = None ++ extra_body: Optional[dict] = None ++ multi_modal_content: Optional[dict] = None ++ ignore_eos: bool = False + + + @dataclass +@@ -31,6 +37,7 @@ class RequestFuncOutput: + ttft: float = 0.0 # Time to first token + itl: List[float] = field( + default_factory=list) # List of inter-token latencies ++ tpot: float = 0.0 # avg next-token latencies + prompt_len: int = 0 + error: str = "" + +@@ -43,13 +50,14 @@ async def async_request_tgi( + assert api_url.endswith("generate_stream") + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: +- assert not request_func_input.use_beam_search + params = { + "best_of": request_func_input.best_of, + "max_new_tokens": request_func_input.output_len, + "do_sample": True, + "temperature": 0.01, # TGI does not accept 0.0 temperature. + "top_p": 0.99, # TGI does not accept 1.0 top_p. ++ "truncate": request_func_input.prompt_len, ++ # TGI does not accept ignore_eos flag. + } + payload = { + "inputs": request_func_input.prompt, +@@ -68,9 +76,13 @@ async def async_request_tgi( + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue ++ chunk_bytes = chunk_bytes.decode("utf-8") + +- chunk = remove_prefix(chunk_bytes.decode("utf-8"), +- "data:") ++ #NOTE: Sometimes TGI returns a ping response without ++ # any data, we should skip it. 
++ if chunk_bytes.startswith(":"): ++ continue ++ chunk = chunk_bytes.removeprefix("data:") + + data = json.loads(chunk) + timestamp = time.perf_counter() +@@ -89,6 +101,9 @@ async def async_request_tgi( + output.latency = most_recent_timestamp - st + output.success = True + output.generated_text = data["generated_text"] ++ else: ++ output.error = response.reason or "" ++ output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() +@@ -107,7 +122,6 @@ async def async_request_trt_llm( + assert api_url.endswith("generate_stream") + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: +- assert not request_func_input.use_beam_search + assert request_func_input.best_of == 1 + payload = { + "accumulate_tokens": True, +@@ -117,6 +131,8 @@ async def async_request_trt_llm( + "max_tokens": request_func_input.output_len, + "stream": True, + } ++ if request_func_input.ignore_eos: ++ payload["min_length"] = request_func_input.output_len + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + +@@ -131,8 +147,8 @@ async def async_request_trt_llm( + if not chunk_bytes: + continue + +- chunk = remove_prefix(chunk_bytes.decode("utf-8"), +- "data:") ++ chunk = chunk_bytes.decode("utf-8").removeprefix( ++ "data:") + + data = json.loads(chunk) + output.generated_text += data["text_output"] +@@ -171,7 +187,6 @@ async def async_request_deepspeed_mii( + ) -> RequestFuncOutput: + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert request_func_input.best_of == 1 +- assert not request_func_input.use_beam_search + + payload = { + "prompt": request_func_input.prompt, +@@ -215,19 +230,22 @@ async def async_request_openai_completions( + ) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith( +- "v1/completions" +- ), "OpenAI Completions API URL must end with 'v1/completions'." ++ ("completions", "profile") ++ ), "OpenAI Completions API URL must end with 'completions' or 'profile'." 
+ + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: +- assert not request_func_input.use_beam_search + payload = { + "model": request_func_input.model, + "prompt": request_func_input.prompt, + "temperature": 0.0, + "best_of": request_func_input.best_of, + "max_tokens": request_func_input.output_len, ++ "logprobs": request_func_input.logprobs, + "stream": True, ++ "ignore_eos": request_func_input.ignore_eos, + } ++ if request_func_input.extra_body: ++ payload.update(request_func_input.extra_body) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } +@@ -243,39 +261,49 @@ async def async_request_openai_completions( + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: ++ first_chunk_received = False + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + +- chunk = remove_prefix(chunk_bytes.decode("utf-8"), +- "data: ") ++ chunk = chunk_bytes.decode("utf-8").removeprefix( ++ "data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + else: + data = json.loads(chunk) + ++ # NOTE: Some completion API might have a last ++ # usage summary response without a token so we ++ # want to check a token was generated + if data["choices"][0]["text"]: + timestamp = time.perf_counter() + # First token +- if ttft == 0.0: ++ if not first_chunk_received: ++ first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase +- # NOTE: Some completion API might have a last +- # usage summary response without a token so we +- # do not want to include as inter-token-latency +- elif data.get("usage", None) is None: ++ else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += data["choices"][0]["text"] +- ++ if first_chunk_received: ++ output.success = True ++ else: ++ output.success = False ++ output.error = ( ++ "Never received a valid chunk to calculate TTFT." ++ "This response will be marked as failed!") + output.generated_text = generated_text +- output.success = True + output.latency = latency ++ else: ++ output.error = response.reason or "" ++ output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() +@@ -292,23 +320,28 @@ async def async_request_openai_chat_completions( + ) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith( +- "v1/chat/completions" +- ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'." ++ "chat/completions" ++ ), "OpenAI Chat Completions API URL must end with 'chat/completions'." 
+ + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: +- assert not request_func_input.use_beam_search ++ content = [{"type": "text", "text": request_func_input.prompt}] ++ if request_func_input.multi_modal_content: ++ content.append(request_func_input.multi_modal_content) + payload = { + "model": request_func_input.model, + "messages": [ + { + "role": "user", +- "content": request_func_input.prompt, ++ "content": content + }, + ], + "temperature": 0.0, +- "max_tokens": request_func_input.output_len, ++ "max_completion_tokens": request_func_input.output_len, + "stream": True, ++ "ignore_eos": request_func_input.ignore_eos, + } ++ if request_func_input.extra_body: ++ payload.update(request_func_input.extra_body) + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", +@@ -330,8 +363,8 @@ async def async_request_openai_chat_completions( + if not chunk_bytes: + continue + +- chunk = remove_prefix(chunk_bytes.decode("utf-8"), +- "data: ") ++ chunk = chunk_bytes.decode("utf-8").removeprefix( ++ "data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + else: +@@ -370,12 +403,28 @@ async def async_request_openai_chat_completions( + return output + + +-# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix) +-# introduced in Python 3.9 +-def remove_prefix(text: str, prefix: str) -> str: +- if text.startswith(prefix): +- return text[len(prefix):] +- return text ++def get_model(pretrained_model_name_or_path: str) -> str: ++ if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': ++ from modelscope import snapshot_download ++ ++ model_path = snapshot_download( ++ model_id=pretrained_model_name_or_path, ++ local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, ++ ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) ++ ++ return model_path ++ return pretrained_model_name_or_path ++ ++ ++def get_tokenizer( ++ pretrained_model_name_or_path: str, trust_remote_code: bool ++) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: ++ if pretrained_model_name_or_path is not None and not os.path.exists( ++ pretrained_model_name_or_path): ++ pretrained_model_name_or_path = get_model( ++ pretrained_model_name_or_path) ++ return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, ++ trust_remote_code=trust_remote_code) + + + ASYNC_REQUEST_FUNCS = { +@@ -386,4 +435,6 @@ ASYNC_REQUEST_FUNCS = { + "openai": async_request_openai_completions, + "openai-chat": async_request_openai_chat_completions, + "tensorrt-llm": async_request_trt_llm, ++ "scalellm": async_request_openai_completions, ++ "sglang": async_request_openai_completions, + } +diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py +new file mode 100644 +index 0000000..1a0e625 +--- /dev/null ++++ b/benchmarks/benchmark_guided.py +@@ -0,0 +1,494 @@ ++"""Benchmark guided decoding throughput.""" ++import argparse ++import dataclasses ++import json ++import os ++import random ++import time ++from typing import List ++ ++import datasets ++import pandas as pd ++import uvloop ++from transformers import AutoTokenizer, PreTrainedTokenizerBase ++ ++from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs ++from vllm.entrypoints.openai.api_server import ( ++ build_async_engine_client_from_engine_args) ++from vllm.sampling_params import GuidedDecodingParams ++from vllm.utils import FlexibleArgumentParser, merge_async_iterators ++ ++ ++@dataclasses.dataclass ++class SampleRequest: ++ """A class 
representing a single inference request for benchmarking. ++ ++ Attributes: ++ prompt: The input text prompt for the model. ++ multi_modal_data: Optional dictionary containing multi-modal data (e.g. ++ images). ++ prompt_len: The length of the prompt in tokens. ++ expected_output_len: The expected length of the output in tokens. ++ """ ++ prompt: str ++ prompt_len: int ++ expected_output_len: int ++ schema: dict ++ structure_type: str = 'json' ++ completion: str = None ++ ++ ++def run_vllm(requests: List[SampleRequest], ++ engine_args: EngineArgs, ++ n: int, ++ guided_decoding_rate: float = 1.0, ++ warmup: bool = False) -> float: ++ from vllm import LLM, SamplingParams ++ llm = LLM(**vars(engine_args)) ++ ++ # Add the requests to the engine. ++ prompts: List[str] = [] ++ sampling_params: List[SamplingParams] = [] ++ # create a list containing random selected true or false ++ guided_decoding_req_idx = random.sample( ++ range(len(requests)), int(len(requests) * guided_decoding_rate)) ++ ++ if warmup: ++ print(">>>>> Running warmup prompt, for the first 5") ++ # We setup the first 5 requests to warmup FSM ++ # if using xgrammar dataset, we will skip warmup ++ warmup_requests = requests[:5] ++ for i, request in enumerate(warmup_requests): ++ prompts.append(request.prompt) ++ sampling_params.append( ++ SamplingParams( ++ n=n, ++ temperature=1.0, ++ top_p=1.0, ++ ignore_eos=True, ++ max_tokens=request.expected_output_len, ++ guided_decoding=GuidedDecodingParams(json=request.schema) ++ if guided_decoding_rate > 0 else None, ++ )) ++ llm.generate(prompts, sampling_params, use_tqdm=False) ++ ++ print(">>>>> Benchmark started...") ++ prompts = [] ++ sampling_params = [] ++ for i, request in enumerate(requests): ++ prompts.append(request.prompt) ++ sampling_params.append( ++ SamplingParams( ++ n=n, ++ temperature=1.0, ++ top_p=1.0, ++ ignore_eos=True, ++ max_tokens=request.expected_output_len, ++ guided_decoding=GuidedDecodingParams( ++ **{request.structure_type: request.schema}) ++ if i in guided_decoding_req_idx else None, ++ )) ++ ++ start = time.perf_counter() ++ outputs = llm.generate(prompts, sampling_params, use_tqdm=False) ++ ret = [] ++ for output, request in zip(outputs, requests): ++ generated_text = output.outputs[0].text ++ ret.append({ ++ "generated": generated_text, ++ "expected": request.completion ++ }) ++ end = time.perf_counter() ++ return end - start, ret ++ ++ ++async def run_vllm_async( ++ requests: List[SampleRequest], ++ engine_args: AsyncEngineArgs, ++ n: int, ++ guided_decoding_rate: float = 1.0, ++ warmup: bool = False, ++ disable_frontend_multiprocessing: bool = False) -> float: ++ from vllm import SamplingParams ++ ++ async with build_async_engine_client_from_engine_args( ++ engine_args, disable_frontend_multiprocessing) as llm: ++ ++ # Add the requests to the engine. 
++ prompts: List[str] = [] ++ sampling_params: List[SamplingParams] = [] ++ guided_decoding_req_idx = random.sample( ++ range(len(requests)), int(len(requests) * guided_decoding_rate)) ++ ++ if warmup: ++ print(">>>>>> Running warmup prompt, for the first 5") ++ # We setup the first 5 requests to warmup FSM ++ # if using xgrammar dataset, we will skip warmup ++ warmup_requests = requests[:5] ++ for i, request in enumerate(warmup_requests): ++ prompts.append(request.prompt) ++ sampling_params.append( ++ SamplingParams( ++ n=n, ++ temperature=1.0, ++ top_p=1.0, ++ ignore_eos=True, ++ max_tokens=request.expected_output_len, ++ guided_decoding=GuidedDecodingParams( ++ json=request.schema) ++ if guided_decoding_rate > 0 else None, ++ )) ++ generators = [] ++ for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): ++ generator = llm.generate(prompt, sp, request_id=f"test{i}") ++ generators.append(generator) ++ all_gens = merge_async_iterators(*generators) ++ async for i, res in all_gens: ++ pass ++ ++ print(">>>>> Benchmark started...") ++ prompts = [] ++ sampling_params = [] ++ for i, request in enumerate(requests): ++ prompts.append(request.prompt) ++ sampling_params.append( ++ SamplingParams( ++ n=n, ++ temperature=1.0, ++ top_p=1.0, ++ ignore_eos=True, ++ max_tokens=request.expected_output_len, ++ guided_decoding=GuidedDecodingParams(json=request.schema) ++ if i in guided_decoding_req_idx else None, ++ )) ++ ++ generators = [] ++ start_time = [] ++ latencies = [] ++ start = time.perf_counter() ++ for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): ++ generator = llm.generate(prompt, sp, request_id=f"test{i}") ++ generators.append(generator) ++ start_time.append(time.perf_counter()) ++ latencies.append([]) ++ all_gens = merge_async_iterators(*generators) ++ generated_texts = [''] * len(requests) ++ async for i, res in all_gens: ++ generated_texts[i] = res.outputs[0].text ++ lat = time.perf_counter() - start_time[i] ++ latencies[i].append(lat) ++ ret = [{ ++ 'generated': gt, ++ 'expected': req.completion ++ } for gt, req in zip(generated_texts, requests)] ++ end = time.perf_counter() ++ first_latency = pd.Series([lat[0] * 1000 for lat in latencies]) ++ next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000 ++ for lat in latencies]) ++ return end - start, ret, (first_latency, next_latency) ++ ++ ++def sample_requests(tokenizer: PreTrainedTokenizerBase, ++ args: argparse.Namespace) -> List[SampleRequest]: ++ if args.dataset == 'json': ++ if args.json_schema_path is None: ++ dir_path = os.path.dirname(os.path.realpath(__file__)) ++ args.json_schema_path = os.path.join(dir_path, ++ "structured_schemas", ++ "structured_schema_1.json") ++ with open(args.json_schema_path) as f: ++ schema = json.load(f) ++ prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 ++ input_len = len(tokenizer(prompt).input_ids) ++ print(f"Input length of the prompt: {input_len} tokens") ++ requests = [ ++ SampleRequest(prompt=prompt, ++ prompt_len=input_len, ++ expected_output_len=args.output_len, ++ schema=schema, ++ structure_type=args.structure_type) ++ for _ in range(args.num_prompts) ++ ] ++ ++ elif args.dataset == "grammar": ++ schema = """ ++ ?start: select_statement ++ ++ ?select_statement: "SELECT " column_list " FROM " table_name ++ ++ ?column_list: column_name ("," column_name)* ++ ++ ?table_name: identifier ++ ++ ?column_name: identifier ++ ++ ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ ++ """ ++ prompt = "Generate an SQL 
query to show the 'username' \ ++ and 'email' from the 'users' table." ++ ++ input_len = len(tokenizer(prompt).input_ids) ++ print(f"Input length of the prompt: {input_len} tokens") ++ requests = [ ++ SampleRequest(prompt=prompt, ++ prompt_len=input_len, ++ expected_output_len=args.output_len, ++ schema=schema, ++ structure_type=args.structure_type) ++ for _ in range(args.num_prompts) ++ ] ++ ++ elif args.dataset == "regex": ++ regex = r"\w+@\w+\.com\n" ++ args.regex = regex ++ prompt = "Generate an email address for Alan Turing, \ ++ who works in Enigma. End in .com and new line. \ ++ Example result: alan.turing@enigma.com\n" ++ ++ input_len = len(tokenizer(prompt).input_ids) ++ print(f"Input length of the prompt: {input_len} tokens") ++ requests = [ ++ SampleRequest(prompt=prompt, ++ prompt_len=input_len, ++ expected_output_len=args.output_len, ++ schema=regex, ++ structure_type=args.structure_type) ++ for _ in range(args.num_prompts) ++ ] ++ ++ elif args.dataset == "choice": ++ choice = ["Positive", "Negative"] ++ args.choice = choice ++ prompt = "Classify this sentiment: vLLM is wonderful!" ++ input_len = len(tokenizer(prompt).input_ids) ++ print(f"Input length of the prompt: {input_len} tokens") ++ requests = [ ++ SampleRequest(prompt=prompt, ++ prompt_len=input_len, ++ expected_output_len=args.output_len, ++ schema=choice, ++ structure_type=args.structure_type) ++ for _ in range(args.num_prompts) ++ ] ++ ++ elif args.dataset == "xgrammar_bench": ++ args.warmup = False ++ requests: List[SampleRequest] = [] ++ dataset = datasets.load_dataset("NousResearch/json-mode-eval", ++ split="train") ++ print(f"dataset has {len(dataset)} entries") ++ len_dataset = len(dataset) ++ for data_point_idx in range(args.num_prompts): ++ idx = data_point_idx ++ while idx >= len_dataset: ++ idx -= len_dataset ++ schema = dataset["schema"][idx] ++ prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], ++ tokenize=False) ++ input_len = len(tokenizer(prompt).input_ids) ++ completion = dataset["completion"][idx] ++ ++ requests.append( ++ SampleRequest(prompt=prompt, ++ prompt_len=input_len, ++ expected_output_len=args.output_len, ++ schema=schema, ++ completion=completion)) ++ ++ return requests ++ ++ ++def evaluate(ret, args): ++ ++ def _eval_correctness_json(expected, actual): ++ # extract json string from string using regex ++ import re ++ actual = actual.replace('\n', '').replace(' ', '').strip() ++ try: ++ actual = re.search(r'\{.*\}', actual).group() ++ actual = json.loads(actual) ++ except Exception: ++ return False ++ ++ return True ++ ++ def _eval_correctness_choice(expected, actual): ++ return actual in args.choice ++ ++ def _eval_correctness_regex(expected, actual): ++ import re ++ return re.match(args.regex, actual) is not None ++ ++ def _eval_correctness(expected, actual): ++ if args.structure_type == 'json': ++ return _eval_correctness_json(expected, actual) ++ elif args.structure_type == 'regex': ++ return _eval_correctness_regex(expected, actual) ++ elif args.structure_type == 'choice': ++ return _eval_correctness_choice(expected, actual) ++ else: ++ return None ++ ++ scores = [] ++ for res in ret: ++ score = _eval_correctness(res['expected'], res['generated']) ++ res['correctness'] = score ++ scores.append(score) ++ ++ not_none_scores = [score for score in scores if score is not None] ++ ++ return (sum(not_none_scores) / len(not_none_scores) * ++ 100) if len(not_none_scores) > 0 else None ++ ++ ++def main(args: argparse.Namespace): ++ print(args) ++ random.seed(args.seed) ++ ++ # async 
engine is working for 'regex', 'choice' and 'grammar' ++ if args.dataset == 'grammar': ++ args.structure_type = 'grammar' ++ args.async_engine = False ++ elif args.dataset == 'regex': ++ args.structure_type = 'regex' ++ args.async_engine = False ++ elif args.dataset == 'choice': ++ args.structure_type = 'choice' ++ args.async_engine = False ++ else: ++ args.structure_type = 'json' ++ ++ if args.no_guided_decoding: ++ args.guided_decoding_ratio = 0 ++ if args.save_results: ++ result_file_name = f'{args.guided_decoding_ratio}guided' ++ result_file_name += f"_{args.model.split('/')[-1]}" ++ result_file_name += f"_{args.dataset}" ++ result_file_name += f"_{args.num_prompts}" ++ result_file_name += f"_out{args.output_len}" ++ result_file_name += f"_async{args.async_engine}" ++ result_file_name += f"_warmup{args.warmup}" ++ result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}" ++ result_file_name += ".txt" ++ else: ++ result_file_name = None ++ ++ # Synthesize a prompt with the given input length. ++ tokenizer = AutoTokenizer.from_pretrained( ++ args.tokenizer, trust_remote_code=args.trust_remote_code) ++ requests = sample_requests(tokenizer, args) ++ ++ if args.async_engine: ++ engine_args = AsyncEngineArgs.from_cli_args(args) ++ elapsed_time, ret, (first_latency, next_latency) = uvloop.run( ++ run_vllm_async(requests, engine_args, args.n, ++ args.guided_decoding_ratio, args.warmup, ++ args.disable_frontend_multiprocessing)) ++ else: ++ engine_args = EngineArgs.from_cli_args(args) ++ elapsed_time, ret = run_vllm(requests, engine_args, args.n, ++ args.guided_decoding_ratio, args.warmup) ++ first_latency, next_latency = None, None ++ ++ score = evaluate(ret, args) ++ total_num_tokens = sum(request.prompt_len + request.expected_output_len ++ for request in requests) ++ total_output_tokens = sum(request.expected_output_len ++ for request in requests) ++ if first_latency is not None: ++ latency_breakdown = "\nFirst token latency(msecs):\n" ++ latency_breakdown += f"{first_latency.describe()}" ++ latency_breakdown += "\nNext token latency(msecs):\n" ++ latency_breakdown += f"{next_latency.describe()}" ++ print( ++ f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " ++ f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " ++ f"{total_output_tokens / elapsed_time:.2f} output tokens/s", ++ f"Correct rate is {score} %", ++ f"{latency_breakdown if first_latency is not None else ''}") ++ ++ # Output JSON results if specified ++ if args.output_json or result_file_name: ++ results = { ++ "elapsed_time": elapsed_time, ++ "num_requests": len(requests), ++ "total_num_tokens": total_num_tokens, ++ "total_output_tokens": total_output_tokens, ++ "requests_per_second": len(requests) / elapsed_time, ++ "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}", ++ "output_tokens_per_second": ++ f"{total_output_tokens / elapsed_time:.2f}", ++ "correct_rate(%)": score ++ } ++ results = {"outputs": ret, **results} ++ if first_latency is not None: ++ results["first_token_latency(msecs)"] = first_latency.describe( ++ ).to_dict() ++ results["next_token_latency(msecs)"] = next_latency.describe( ++ ).to_dict() ++ if args.output_json: ++ with open(args.output_json, "w") as f: ++ json.dump(results, f, indent=4) ++ elif result_file_name: ++ with open(result_file_name, "w") as f: ++ json.dump(results, f, indent=4) ++ ++ ++if __name__ == "__main__": ++ parser = FlexibleArgumentParser(description="Benchmark guided decoding.") ++ parser = AsyncEngineArgs.add_cli_args(parser) ++ ++ 
parser.add_argument("--output-len", ++ type=int, ++ default=512, ++ help="Output length for each request. Overrides the " ++ "output length from the dataset.") ++ parser.add_argument( ++ "--dataset", ++ default='json', ++ choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) ++ parser.add_argument("--json_schema_path", ++ type=str, ++ default=None, ++ help="Path to json schema.") ++ parser.add_argument("--n", ++ type=int, ++ default=1, ++ help="Number of generated sequences per prompt.") ++ parser.add_argument("--num-prompts", ++ type=int, ++ default=10, ++ help="Number of prompts to process.") ++ parser.add_argument( ++ '--output-json', ++ type=str, ++ default=None, ++ help='Path to save the throughput results in JSON format.') ++ parser.add_argument("--async-engine", ++ action='store_true', ++ default=False, ++ help="Use vLLM async engine rather than LLM class.") ++ parser.add_argument("--no-guided-decoding", ++ action='store_true', ++ default=False, ++ help="Whether to disable JSON decoding or not.") ++ parser.add_argument("--guided-decoding-ratio", ++ type=float, ++ default=1.0, ++ help="Ratio of Guided Decoding requests") ++ parser.add_argument("--disable-frontend-multiprocessing", ++ action='store_true', ++ default=False, ++ help="Disable decoupled async engine frontend.") ++ parser.add_argument("--warmup", ++ action="store_true", ++ default=False, ++ help="Run warmup prompts before benchmark.") ++ parser.add_argument("--save-results", ++ action="store_true", ++ default=False, ++ help="save output results.") ++ args = parser.parse_args() ++ if args.tokenizer is None: ++ args.tokenizer = args.model ++ main(args) +diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py +index 44da3ba..77c4f6a 100644 +--- a/benchmarks/benchmark_latency.py ++++ b/benchmarks/benchmark_latency.py +@@ -1,42 +1,35 @@ + """Benchmark the latency of processing a single batch of requests.""" + import argparse ++import dataclasses ++import json + import time + from pathlib import Path +-from typing import Optional ++from typing import List, Optional + + import numpy as np + import torch + from tqdm import tqdm + + from vllm import LLM, SamplingParams +-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS ++from vllm.engine.arg_utils import EngineArgs ++from vllm.inputs import PromptType ++from vllm.sampling_params import BeamSearchParams ++from vllm.utils import FlexibleArgumentParser + + + def main(args: argparse.Namespace): + print(args) + ++ engine_args = EngineArgs.from_cli_args(args) ++ + # NOTE(woosuk): If the request cannot be processed in a single batch, + # the engine will automatically process the request in multiple batches. 
+- llm = LLM(model=args.model, +- tokenizer=args.tokenizer, +- quantization=args.quantization, +- tensor_parallel_size=args.tensor_parallel_size, +- trust_remote_code=args.trust_remote_code, +- dtype=args.dtype, +- enforce_eager=args.enforce_eager, +- kv_cache_dtype=args.kv_cache_dtype, +- quantization_param_path=args.quantization_param_path, +- device=args.device, +- ray_workers_use_nsight=args.ray_workers_use_nsight, +- enable_chunked_prefill=args.enable_chunked_prefill, +- download_dir=args.download_dir, +- block_size=args.block_size) ++ llm = LLM(**dataclasses.asdict(engine_args)) + + sampling_params = SamplingParams( + n=args.n, +- temperature=0.0 if args.use_beam_search else 1.0, ++ temperature=1.0, + top_p=1.0, +- use_beam_search=args.use_beam_search, + ignore_eos=True, + max_tokens=args.output_len, + ) +@@ -44,7 +37,23 @@ def main(args: argparse.Namespace): + dummy_prompt_token_ids = np.random.randint(10000, + size=(args.batch_size, + args.input_len)) +- dummy_prompt_token_ids = dummy_prompt_token_ids.tolist() ++ dummy_prompts: List[PromptType] = [{ ++ "prompt_token_ids": batch ++ } for batch in dummy_prompt_token_ids.tolist()] ++ ++ def llm_generate(): ++ if not args.use_beam_search: ++ llm.generate(dummy_prompts, ++ sampling_params=sampling_params, ++ use_tqdm=False) ++ else: ++ llm.beam_search( ++ dummy_prompts, ++ BeamSearchParams( ++ beam_width=args.n, ++ max_tokens=args.output_len, ++ ignore_eos=True, ++ )) + + def run_to_completion(profile_dir: Optional[str] = None): + if profile_dir: +@@ -55,15 +64,11 @@ def main(args: argparse.Namespace): + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_dir))) as p: +- llm.generate(prompt_token_ids=dummy_prompt_token_ids, +- sampling_params=sampling_params, +- use_tqdm=False) +- print(p.key_averages()) ++ llm_generate() ++ print(p.key_averages().table(sort_by="self_cuda_time_total")) + else: + start_time = time.perf_counter() +- llm.generate(prompt_token_ids=dummy_prompt_token_ids, +- sampling_params=sampling_params, +- use_tqdm=False) ++ llm_generate() + end_time = time.perf_counter() + latency = end_time - start_time + return latency +@@ -87,24 +92,27 @@ def main(args: argparse.Namespace): + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion(profile_dir=None)) + latencies = np.array(latencies) +- percentages = [10, 25, 50, 75, 90] ++ percentages = [10, 25, 50, 75, 90, 99] + percentiles = np.percentile(latencies, percentages) + print(f'Avg latency: {np.mean(latencies)} seconds') + for percentage, percentile in zip(percentages, percentiles): + print(f'{percentage}% percentile latency: {percentile} seconds') + ++ # Output JSON results if specified ++ if args.output_json: ++ results = { ++ "avg_latency": np.mean(latencies), ++ "latencies": latencies.tolist(), ++ "percentiles": dict(zip(percentages, percentiles.tolist())), ++ } ++ with open(args.output_json, "w") as f: ++ json.dump(results, f, indent=4) ++ + + if __name__ == '__main__': +- parser = argparse.ArgumentParser( ++ parser = FlexibleArgumentParser( + description='Benchmark the latency of processing a single batch of ' + 'requests till completion.') +- parser.add_argument('--model', type=str, default='facebook/opt-125m') +- parser.add_argument('--tokenizer', type=str, default=None) +- parser.add_argument('--quantization', +- '-q', +- choices=[*QUANTIZATION_METHODS, None], +- default=None) +- parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) + parser.add_argument('--input-len', 
type=int, default=32) + parser.add_argument('--output-len', type=int, default=128) + parser.add_argument('--batch-size', type=int, default=8) +@@ -121,41 +129,6 @@ if __name__ == '__main__': + type=int, + default=30, + help='Number of iterations to run.') +- parser.add_argument('--trust-remote-code', +- action='store_true', +- help='trust remote code from huggingface') +- parser.add_argument( +- '--dtype', +- type=str, +- default='auto', +- choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], +- help='data type for model weights and activations. ' +- 'The "auto" option will use FP16 precision ' +- 'for FP32 and FP16 models, and BF16 precision ' +- 'for BF16 models.') +- parser.add_argument('--enforce-eager', +- action='store_true', +- help='enforce eager mode and disable CUDA graph') +- parser.add_argument( +- "--kv-cache-dtype", +- type=str, +- choices=['auto', 'fp8'], +- default='auto', +- help= +- 'Data type for kv cache storage. If "auto", will use model data type. ' +- 'FP8_E5M2 (without scaling) is only supported on cuda version greater ' +- 'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for ' +- 'common inference criteria.') +- parser.add_argument( +- '--quantization-param-path', +- type=str, +- default=None, +- help='Path to the JSON file containing the KV cache scaling factors. ' +- 'This should generally be supplied, when KV cache dtype is FP8. ' +- 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' +- 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' +- 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' +- 'instead supported for common inference criteria.') + parser.add_argument( + '--profile', + action='store_true', +@@ -167,29 +140,11 @@ if __name__ == '__main__': + help=('path to save the pytorch profiler output. Can be visualized ' + 'with ui.perfetto.dev or Tensorboard.')) + parser.add_argument( +- "--device", ++ '--output-json', + type=str, +- default="cuda", +- choices=["cuda", "cpu"], +- help='device type for vLLM execution, supporting CUDA and CPU.') +- parser.add_argument('--block-size', +- type=int, +- default=16, +- help='block size of key/value cache') +- parser.add_argument( +- '--enable-chunked-prefill', +- action='store_true', +- help='If True, the prefill requests can be chunked based on the ' +- 'max_num_batched_tokens') +- parser.add_argument( +- "--ray-workers-use-nsight", +- action='store_true', +- help="If specified, use nsight to profile ray workers", +- ) +- parser.add_argument('--download-dir', +- type=str, +- default=None, +- help='directory to download and load the weights, ' +- 'default to the default cache dir of huggingface') ++ default=None, ++ help='Path to save the latency results in JSON format.') ++ ++ parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) +diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py +new file mode 100644 +index 0000000..0b8fba3 +--- /dev/null ++++ b/benchmarks/benchmark_long_document_qa_throughput.py +@@ -0,0 +1,183 @@ ++""" ++Offline benchmark to test the long document QA throughput. ++ ++Example usage: ++ # This workload samples 8 different prompts with a default input ++ # length of 20000 tokens, then replicates each prompt 2 times ++ # in random order. 
++ python benchmark_long_document_qa_throughput.py \ ++ --model meta-llama/Llama-2-7b-chat-hf \ ++ --enable-prefix-caching \ ++ --num-documents 8 \ ++ --repeat-count 2 ++ ++Commandline arguments: ++ --num-documents: The number of documents to sample prompts from. ++ ++ --document-length: The length of each document in tokens. ++ (Optional, default: 20000) ++ ++ --output-len: The number of tokens to generate for each prompt. ++ (Optional, default: 10) ++ ++ --repeat-count: The number of times to repeat each prompt. ++ (Optional, default: 2) ++ ++ --repeat-mode: The mode to repeat prompts. The supported modes are: ++ - 'random': shuffle the prompts randomly. (Default) ++ - 'tile': the entire prompt list is repeated in sequence. (Potentially ++ lowest cache hit) ++ - 'interleave': each prompt is repeated consecutively before ++ moving to the next element. (Highest cache hit) ++ ++ --shuffle-seed: Random seed when the repeat mode is "random". ++ (Optional, default: 0) ++ ++In the meantime, it also supports all the vLLM engine args to initialize the ++LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more ++details. ++""" ++ ++import dataclasses ++import random ++import time ++ ++from vllm import LLM, SamplingParams ++from vllm.engine.arg_utils import EngineArgs ++from vllm.utils import FlexibleArgumentParser ++ ++ ++def test_long_document_qa(llm=None, sampling_params=None, prompts=None): ++ """ ++ Test long document QA with the given prompts and sampling parameters. ++ Print the time spent in processing all the prompts. ++ ++ Args: ++ llm: The language model used for generating responses. ++ sampling_params: Sampling parameter used to generate the response. ++ prompts: A list of prompt strings to be processed by the LLM. ++ """ ++ start_time = time.time() ++ llm.generate(prompts, sampling_params=sampling_params) ++ end_time = time.time() ++ print(f"Time to execute all requests: {end_time - start_time:.4f} secs") ++ ++ ++def repeat_prompts(prompts, repeat_count, mode: str): ++ """ ++ Repeat each prompt in the list for a specified number of times. ++ The order of prompts in the output list depends on the mode. ++ ++ Args: ++ prompts: A list of prompts to be repeated. ++ repeat_count: The number of times each prompt is repeated. ++ mode: The mode of repetition. Supported modes are: ++ - 'random': Shuffle the prompts randomly after repetition. ++ - 'tile': Repeat the entire prompt list in sequence. ++ Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3]. ++ - 'interleave': Repeat each prompt consecutively before moving to ++ the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3]. ++ ++ Returns: ++ A list of repeated prompts in the specified order. ++ ++ Raises: ++ ValueError: If an invalid mode is provided. 
++ """ ++ print("Repeat mode: ", mode) ++ if mode == 'random': ++ repeated_prompts = prompts * repeat_count ++ random.shuffle(repeated_prompts) ++ return repeated_prompts ++ elif mode == 'tile': ++ return prompts * repeat_count ++ elif mode == 'interleave': ++ repeated_prompts = [] ++ for prompt in prompts: ++ repeated_prompts.extend([prompt] * repeat_count) ++ return repeated_prompts ++ else: ++ raise ValueError(f"Invalid mode: {mode}, only support " ++ "'random', 'tile', 'interleave'") ++ ++ ++def main(args): ++ random.seed(args.shuffle_seed) ++ ++ # Prepare the prompts: ++ # we append the document id at the beginning to avoid any of the document ++ # being the prefix of other documents ++ prompts = [ ++ str(i) + ' '.join(['hi'] * args.document_length) ++ for i in range(args.num_documents) ++ ] ++ ++ prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode) ++ ++ warmup_prompts = [ ++ "This is warm up request " + str(i) + \ ++ ' '.join(['hi'] * args.document_length) ++ for i in range(args.num_documents)] ++ ++ # Create the LLM engine ++ engine_args = EngineArgs.from_cli_args(args) ++ llm = LLM(**dataclasses.asdict(engine_args)) ++ sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) ++ ++ print("------warm up------") ++ test_long_document_qa( ++ llm=llm, ++ prompts=warmup_prompts, ++ sampling_params=sampling_params, ++ ) ++ ++ print("------start generating------") ++ test_long_document_qa( ++ llm=llm, ++ prompts=prompts, ++ sampling_params=sampling_params, ++ ) ++ ++ ++if __name__ == "__main__": ++ parser = FlexibleArgumentParser( ++ description= ++ 'Benchmark the performance with or without automatic prefix caching.') ++ ++ parser.add_argument( ++ '--document-length', ++ type=int, ++ # Roughly the number of tokens for a system paper, ++ # excluding images ++ default=20000, ++ help='Range of input lengths for sampling prompts,' ++ 'specified as "min:max" (e.g., "128:256").') ++ ++ parser.add_argument('--num-documents', ++ type=int, ++ default=8, ++ help='Range of input lengths for sampling prompts,' ++ 'specified as "min:max" (e.g., "128:256").') ++ ++ parser.add_argument('--output-len', type=int, default=10) ++ ++ parser.add_argument('--repeat-count', ++ type=int, ++ default=2, ++ help='Number of times to repeat each prompt') ++ ++ parser.add_argument("--repeat-mode", ++ type=str, ++ default='random', ++ help='The mode to repeat prompts. The supported ' ++ 'modes are "random", "tile", and "interleave". ' ++ 'See repeat_prompts() in the source code for details.') ++ ++ parser.add_argument("--shuffle-seed", ++ type=int, ++ default=0, ++ help='Random seed when the repeat mode is "random"') ++ ++ parser = EngineArgs.add_cli_args(parser) ++ args = parser.parse_args() ++ main(args) +diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py +index 0899669..3ab421a 100644 +--- a/benchmarks/benchmark_prefix_caching.py ++++ b/benchmarks/benchmark_prefix_caching.py +@@ -1,7 +1,47 @@ +-import argparse ++""" ++Benchmark the efficiency of prefix caching. ++ ++This script allows you to benchmark the performance of ++a model with and without prefix caching using either fixed prompts ++or prompts sampled from the ShareGPT dataset. 
++ ++Fixed example usage: ++ python benchmark_prefix_caching.py \ ++ --model meta-llama/Llama-2-7b-chat-hf \ ++ --enable-prefix-caching \ ++ --num-prompts 1 \ ++ --repeat-count 100 \ ++ --input-length-range 128:256 ++ ++ShareGPT example usage: ++ # This command samples 20 prompts with input lengths ++ # between 128 and 256 tokens from the ShareGPT dataset, ++ # then replicates each prompt 5 times. ++ python benchmark_prefix_caching.py \ ++ --model meta-llama/Llama-2-7b-chat-hf \ ++ --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \ ++ --enable-prefix-caching \ ++ --num-prompts 20 \ ++ --repeat-count 5 \ ++ --input-length-range 128:256 ++""" ++ ++import dataclasses ++import json ++import random + import time ++from typing import List, Optional, Tuple ++ ++from transformers import PreTrainedTokenizerBase + + from vllm import LLM, SamplingParams ++from vllm.engine.arg_utils import EngineArgs ++from vllm.utils import FlexibleArgumentParser ++ ++try: ++ from vllm.transformers_utils.tokenizer import get_tokenizer ++except ImportError: ++ from backend_request_func import get_tokenizer + + PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 + +@@ -15,25 +55,150 @@ def test_prefix(llm=None, sampling_params=None, prompts=None): + print(f"cost time {end_time - start_time}") + + ++@dataclasses.dataclass ++class Request: ++ prompt: str ++ prompt_len: int ++ output_len: int ++ ++ ++def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str: ++ vocab = tokenizer.get_vocab() ++ # Remove the special tokens. 
++    vocab = {
++        k: v
++        for k, v in vocab.items() if k not in tokenizer.all_special_ids
++    }
++    return random.choices(list(vocab.values()), k=length)
++
++
++def sample_requests_from_dataset(
++    dataset_path: str,
++    num_requests: int,
++    tokenizer: PreTrainedTokenizerBase,
++    input_length_range: Tuple[int, int],
++    fixed_output_len: Optional[int],
++) -> List[Request]:
++    if fixed_output_len is not None and fixed_output_len < 4:
++        raise ValueError("output_len too small")
++
++    # Load the dataset.
++    with open(dataset_path) as f:
++        dataset = json.load(f)
++    # Filter out the conversations with less than 2 turns.
++    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
++    # Only keep the first two turns of each conversation.
++    dataset = [(data["conversations"][0]["value"],
++                data["conversations"][1]["value"]) for data in dataset]
++
++    # Shuffle the dataset.
++    random.shuffle(dataset)
++
++    min_len, max_len = input_length_range
++    assert min_len >= 0 and max_len >= min_len, "input_length_range too small"
++
++    # Filter out sequences that are too long or too short
++    filtered_requests: List[Request] = []
++
++    for i in range(len(dataset)):
++        if len(filtered_requests) == num_requests:
++            break
++
++        # Tokenize the prompts and completions.
++        prompt_token_ids = tokenizer(dataset[i][0]).input_ids
++        prompt = tokenizer.decode(prompt_token_ids)
++        completion = dataset[i][1]
++        completion_token_ids = tokenizer(completion).input_ids
++        prompt_len = len(prompt_token_ids)
++        output_len = (len(completion_token_ids)
++                      if fixed_output_len is None else fixed_output_len)
++        if min_len <= prompt_len <= max_len:
++            filtered_requests.append(Request(prompt, prompt_len, output_len))
++
++    return filtered_requests
++
++
++def sample_requests_from_random(
++    num_requests: int,
++    tokenizer: PreTrainedTokenizerBase,
++    input_length_range: Tuple[int, int],
++    fixed_output_len: Optional[int],
++    prefix_len: int,
++) -> List[Request]:
++
++    requests = []
++    prefix_token_ids = sample_tokens(tokenizer, prefix_len)
++    min_len, max_len = input_length_range
++
++    for i in range(num_requests):
++        unique_part_token_ids = sample_tokens(
++            tokenizer,
++            random.randint(min_len - prefix_len, max_len - prefix_len))
++        prompt_token_ids = prefix_token_ids + unique_part_token_ids
++        prompt = tokenizer.decode(prompt_token_ids)
++        prompt_len = len(prompt_token_ids)
++        assert (min_len <= prompt_len <= max_len
++                ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
++        requests.append(Request(prompt, prompt_len, fixed_output_len))
++    return requests
++
++
++def repeat_and_sort_requests(requests: List[Request],
++                             repeat_count: int,
++                             sort: bool = False) -> List[str]:
++    repeated_requests = requests * repeat_count
++    if sort:
++        repeated_requests.sort(key=lambda x: x[1])
++    else:
++        random.shuffle(repeated_requests)
++    return [req.prompt for req in repeated_requests]
++
++
+ def main(args):
+-    llm = LLM(model=args.model,
+-              tokenizer_mode='auto',
+-              trust_remote_code=True,
+-              enforce_eager=True,
+-              use_v2_block_manager=args.use_v2_block_manager,
+-              tensor_parallel_size=args.tensor_parallel_size,
+-              enable_prefix_caching=args.enable_prefix_caching)
+-
+-    num_prompts = 100
+-    prompts = [PROMPT] * num_prompts
++    tokenizer = get_tokenizer(args.model, trust_remote_code=True)
++    input_length_range = tuple(map(int, args.input_length_range.split(':')))
++    random.seed(args.seed)
++    if args.dataset_path is not None:
++        if args.prefix_len > 0:
++            raise ValueError("prefix-len is not supported when "
++                             
"dataset-path is provided.") ++ print(f"Start to sample {args.num_prompts} prompts " ++ f"from {args.dataset_path}") ++ filtered_requests = sample_requests_from_dataset( ++ dataset_path=args.dataset_path, ++ num_requests=args.num_prompts, ++ tokenizer=tokenizer, ++ input_length_range=input_length_range, ++ fixed_output_len=args.output_len, ++ ) ++ else: ++ print(f"Start to sample {args.num_prompts} prompts from random") ++ filtered_requests = sample_requests_from_random( ++ num_requests=args.num_prompts, ++ tokenizer=tokenizer, ++ input_length_range=input_length_range, ++ fixed_output_len=args.output_len, ++ prefix_len=args.prefix_len, ++ ) ++ ++ # Print some helpful stats of the requests. ++ print(f"Sampled {len(filtered_requests)} requests.") ++ prompt_lens = [req.prompt_len for req in filtered_requests] ++ print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}") ++ print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}") ++ print(f"Min Prompt Length: {min(prompt_lens)}") ++ print(f"Max Prompt Length: {max(prompt_lens)}") ++ ++ engine_args = EngineArgs.from_cli_args(args) ++ ++ llm = LLM(**dataclasses.asdict(engine_args)) ++ + sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) + +- print("------warm up------") +- test_prefix( +- llm=llm, +- prompts=prompts, +- sampling_params=sampling_params, +- ) ++ print("Testing filtered requests") ++ prompts = repeat_and_sort_requests(filtered_requests, ++ repeat_count=args.repeat_count, ++ sort=args.sort) + + print("------start generating------") + test_prefix( +@@ -44,19 +209,40 @@ def main(args): + + + if __name__ == "__main__": +- parser = argparse.ArgumentParser( +- description='Benchmark the performance with or without automatic ' +- 'prefix caching.') +- parser.add_argument('--model', ++ parser = FlexibleArgumentParser( ++ description= ++ 'Benchmark the performance with or without automatic prefix caching.') ++ parser.add_argument("--dataset-path", + type=str, +- default='baichuan-inc/Baichuan2-13B-Chat') +- parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) ++ default=None, ++ help="Path to the dataset.") + parser.add_argument('--output-len', type=int, default=10) +- parser.add_argument('--enable-prefix-caching', ++ parser.add_argument('--num-prompts', ++ type=int, ++ required=True, ++ help="Number of the prompts sampled from dataset") ++ parser.add_argument('--repeat-count', ++ type=int, ++ default=1, ++ help='Number of times to repeat each prompt') ++ parser.add_argument('--sort', + action='store_true', +- help='enable prefix caching') +- parser.add_argument('--use-v2-block-manager', +- action='store_true', +- help='Use BlockSpaceMangerV2') ++ help='Sort prompts by input length') ++ parser.add_argument('--input-length-range', ++ type=str, ++ required=True, ++ help='Range of input lengths for sampling prompts,' ++ 'specified as "min:max" (e.g., "128:256").') ++ parser.add_argument( ++ "--prefix-len", ++ type=int, ++ default=0, ++ help="Specifies the length of a common prefix to be " ++ "added to the input prompt. The input-length-range will " ++ "subtract this length when filtering prompts. 
Only used " ++ "when dataset-path is not provided.", ++ ) ++ ++ parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) +diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py +new file mode 100644 +index 0000000..e0c9e6a +--- /dev/null ++++ b/benchmarks/benchmark_prioritization.py +@@ -0,0 +1,177 @@ ++"""Benchmark offline prioritization.""" ++import argparse ++import dataclasses ++import json ++import random ++import time ++from typing import List, Optional, Tuple ++ ++from transformers import AutoTokenizer, PreTrainedTokenizerBase ++ ++from vllm.engine.arg_utils import EngineArgs ++from vllm.utils import FlexibleArgumentParser ++ ++ ++def sample_requests( ++ dataset_path: str, ++ num_requests: int, ++ tokenizer: PreTrainedTokenizerBase, ++ fixed_output_len: Optional[int], ++) -> List[Tuple[str, int, int]]: ++ if fixed_output_len is not None and fixed_output_len < 4: ++ raise ValueError("output_len too small") ++ ++ # Load the dataset. ++ with open(dataset_path) as f: ++ dataset = json.load(f) ++ # Filter out the conversations with less than 2 turns. ++ dataset = [data for data in dataset if len(data["conversations"]) >= 2] ++ # Only keep the first two turns of each conversation. ++ dataset = [(data["conversations"][0]["value"], ++ data["conversations"][1]["value"]) for data in dataset] ++ ++ # Shuffle the dataset. ++ random.shuffle(dataset) ++ ++ # Filter out sequences that are too long or too short ++ filtered_dataset: List[Tuple[str, int, int]] = [] ++ for i in range(len(dataset)): ++ if len(filtered_dataset) == num_requests: ++ break ++ ++ # Tokenize the prompts and completions. ++ prompt = dataset[i][0] ++ prompt_token_ids = tokenizer(prompt).input_ids ++ completion = dataset[i][1] ++ completion_token_ids = tokenizer(completion).input_ids ++ prompt_len = len(prompt_token_ids) ++ output_len = len(completion_token_ids ++ ) if fixed_output_len is None else fixed_output_len ++ if prompt_len < 4 or output_len < 4: ++ # Prune too short sequences. ++ continue ++ if prompt_len > 1024 or prompt_len + output_len > 2048: ++ # Prune too long sequences. ++ continue ++ ++ #Select a equi-probable random priority ++ priority = 0 if random.random() < 0.5 else 1 ++ ++ filtered_dataset.append((prompt, prompt_len, output_len, priority)) ++ ++ return filtered_dataset ++ ++ ++def run_vllm( ++ requests: List[Tuple[str, int, int]], ++ n: int, ++ engine_args: EngineArgs, ++) -> float: ++ from vllm import LLM, SamplingParams ++ llm = LLM(**dataclasses.asdict(engine_args)) ++ ++ # Add the requests to the engine. ++ prompts = [] ++ sampling_params = [] ++ priority = [] ++ for prompt, _, output_len, _priority in requests: ++ prompts.append(prompt) ++ priority.append(_priority) ++ sampling_params.append( ++ SamplingParams( ++ n=n, ++ temperature=1.0, ++ top_p=1.0, ++ ignore_eos=True, ++ max_tokens=output_len, ++ )) ++ ++ start = time.perf_counter() ++ llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True) ++ end = time.perf_counter() ++ return end - start ++ ++ ++def main(args: argparse.Namespace): ++ print(args) ++ random.seed(args.seed) ++ ++ # Sample the requests. ++ tokenizer = AutoTokenizer.from_pretrained( ++ args.tokenizer, trust_remote_code=args.trust_remote_code) ++ if args.dataset is None: ++ # Synthesize a prompt with the given input length. 
++ prompt = "hi" * (args.input_len - 1) ++ requests = [(prompt, args.input_len, args.output_len) ++ for _ in range(args.num_prompts)] ++ else: ++ requests = sample_requests(args.dataset, args.num_prompts, tokenizer, ++ args.output_len) ++ ++ if args.backend == "vllm": ++ elapsed_time = run_vllm(requests, args.n, ++ EngineArgs.from_cli_args(args)) ++ else: ++ raise ValueError(f"Unknown backend: {args.backend}") ++ total_num_tokens = sum(prompt_len + output_len ++ for _, prompt_len, output_len, priority in requests) ++ print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " ++ f"{total_num_tokens / elapsed_time:.2f} tokens/s") ++ ++ # Output JSON results if specified ++ if args.output_json: ++ results = { ++ "elapsed_time": elapsed_time, ++ "num_requests": len(requests), ++ "total_num_tokens": total_num_tokens, ++ "requests_per_second": len(requests) / elapsed_time, ++ "tokens_per_second": total_num_tokens / elapsed_time, ++ } ++ with open(args.output_json, "w") as f: ++ json.dump(results, f, indent=4) ++ ++ ++if __name__ == "__main__": ++ parser = FlexibleArgumentParser(description="Benchmark the throughput.") ++ parser.add_argument("--backend", ++ type=str, ++ choices=["vllm", "hf", "mii"], ++ default="vllm") ++ parser.add_argument("--dataset", ++ type=str, ++ default=None, ++ help="Path to the dataset.") ++ parser.add_argument("--input-len", ++ type=int, ++ default=None, ++ help="Input prompt length for each request") ++ parser.add_argument("--output-len", ++ type=int, ++ default=None, ++ help="Output length for each request. Overrides the " ++ "output length from the dataset.") ++ parser.add_argument("--n", ++ type=int, ++ default=1, ++ help="Number of generated sequences per prompt.") ++ parser.add_argument("--num-prompts", ++ type=int, ++ default=200, ++ help="Number of prompts to process.") ++ parser.add_argument( ++ '--output-json', ++ type=str, ++ default=None, ++ help='Path to save the throughput results in JSON format.') ++ ++ parser = EngineArgs.add_cli_args(parser) ++ args = parser.parse_args() ++ if args.tokenizer is None: ++ args.tokenizer = args.model ++ if args.dataset is None: ++ assert args.input_len is not None ++ assert args.output_len is not None ++ else: ++ assert args.input_len is None ++ ++ main(args) +diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py +index 2c2d69d..4eb0e1f 100644 +--- a/benchmarks/benchmark_serving.py ++++ b/benchmarks/benchmark_serving.py +@@ -1,9 +1,9 @@ +-"""Benchmark online serving throughput. ++r"""Benchmark online serving throughput. + + On the server side, run one of the following commands: + vLLM OpenAI API server +- python -m vllm.entrypoints.openai.api_server \ +- --model --swap-space 16 \ ++ vllm serve \ ++ --swap-space 16 \ + --disable-log-requests + + (TGI backend) +@@ -17,9 +17,15 @@ On the client side, run: + --dataset-path \ + --request-rate \ # By default is inf + --num-prompts # By default is 1000 ++ ++ when using tgi backend, add ++ --endpoint /generate_stream ++ to the end of the command above. 
+ """ + import argparse + import asyncio ++import base64 ++import io + import json + import os + import random +@@ -27,15 +33,27 @@ import time + import warnings + from dataclasses import dataclass + from datetime import datetime +-from typing import AsyncGenerator, List, Optional, Tuple ++from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple + + import numpy as np + from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, + RequestFuncOutput) ++from datasets import load_dataset ++from PIL.Image import Image + from tqdm.asyncio import tqdm + from transformers import PreTrainedTokenizerBase + +-from vllm.transformers_utils.tokenizer import get_tokenizer ++try: ++ from vllm.transformers_utils.tokenizer import get_tokenizer ++except ImportError: ++ from backend_request_func import get_tokenizer ++ ++try: ++ from vllm.utils import FlexibleArgumentParser ++except ImportError: ++ from argparse import ArgumentParser as FlexibleArgumentParser ++ ++MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + + + @dataclass +@@ -44,14 +62,28 @@ class BenchmarkMetrics: + total_input: int + total_output: int + request_throughput: float +- input_throughput: float ++ request_goodput: float + output_throughput: float ++ total_token_throughput: float + mean_ttft_ms: float + median_ttft_ms: float +- p99_ttft_ms: float ++ std_ttft_ms: float ++ percentiles_ttft_ms: List[Tuple[float, float]] + mean_tpot_ms: float + median_tpot_ms: float +- p99_tpot_ms: float ++ std_tpot_ms: float ++ percentiles_tpot_ms: List[Tuple[float, float]] ++ mean_itl_ms: float ++ median_itl_ms: float ++ std_itl_ms: float ++ percentiles_itl_ms: List[Tuple[float, float]] ++ # E2EL stands for end-to-end latency per request. ++ # It is the time taken on the client side from sending ++ # a request to receiving a complete response. ++ mean_e2el_ms: float ++ median_e2el_ms: float ++ std_e2el_ms: float ++ percentiles_e2el_ms: List[Tuple[float, float]] + + + def sample_sharegpt_requests( +@@ -59,12 +91,9 @@ def sample_sharegpt_requests( + num_requests: int, + tokenizer: PreTrainedTokenizerBase, + fixed_output_len: Optional[int] = None, +-) -> List[Tuple[str, int, int]]: +- if fixed_output_len is not None and fixed_output_len < 4: +- raise ValueError("output_len too small") +- ++) -> List[Tuple[str, int, int, None]]: + # Load the dataset. +- with open(dataset_path) as f: ++ with open(dataset_path, encoding='utf-8') as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] +@@ -89,13 +118,13 @@ def sample_sharegpt_requests( + prompt_len = len(prompt_token_ids) + output_len = len(completion_token_ids + ) if fixed_output_len is None else fixed_output_len +- if prompt_len < 4 or output_len < 4: ++ if prompt_len < 4 or (fixed_output_len is None and output_len < 4): + # Prune too short sequences. + continue + if prompt_len > 1024 or prompt_len + output_len > 2048: + # Prune too long sequences. + continue +- filtered_dataset.append((prompt, prompt_len, output_len)) ++ filtered_dataset.append((prompt, prompt_len, output_len, None)) + + return filtered_dataset + +@@ -107,13 +136,13 @@ def sample_sonnet_requests( + output_len: int, + prefix_len: int, + tokenizer: PreTrainedTokenizerBase, +-) -> List[Tuple[str, str, int, int]]: ++) -> List[Tuple[str, str, int, int, None]]: + assert ( + input_len > prefix_len + ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'." + + # Load the dataset. 
+- with open(dataset_path) as f: ++ with open(dataset_path, encoding='utf-8') as f: + poem_lines = f.readlines() + + # Tokenize the poem lines. +@@ -150,9 +179,9 @@ def sample_sonnet_requests( + # Sample the rest of lines per request. + sampled_requests: List[Tuple[str, int, int]] = [] + for _ in range(num_requests): +- sampled_lines = "".join( +- prefix_lines + +- random.sample(poem_lines, num_input_lines - num_prefix_lines)) ++ num_lines_needed = num_input_lines - num_prefix_lines ++ sampled_lines = "".join(prefix_lines + ++ random.choices(poem_lines, k=num_lines_needed)) + + prompt = f"{base_prompt}{sampled_lines}" + message = [ +@@ -165,24 +194,224 @@ def sample_sonnet_requests( + message, add_generation_prompt=True, tokenize=False) + prompt_len = len(tokenizer(prompt_formatted).input_ids) + sampled_requests.append( +- (prompt, prompt_formatted, prompt_len, output_len)) ++ (prompt, prompt_formatted, prompt_len, output_len, None)) ++ ++ return sampled_requests ++ ++ ++def sample_mmmu_pro_vision_requests( ++ dataset, ++ num_requests: int, ++ tokenizer: PreTrainedTokenizerBase, ++ fixed_output_len: Optional[int] = None, ++) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: ++ sampled_requests: List[Tuple[str, int, int, Dict[str, ++ Collection[str]]]] = [] ++ for data in dataset: ++ if len(sampled_requests) == num_requests: ++ break ++ ++ # MMMU-Pro vision direct prompt ++ # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5 ++ prompt = ( ++ "Answer with the option letter from the given choices directly. " ++ "The last line of your response should be of the following " ++ "format: 'Answer: $LETTER' (without quotes) where LETTER is one of " ++ "options.") ++ ++ prompt_token_ids = tokenizer(prompt).input_ids ++ if fixed_output_len is None: ++ # Default max output len is set to 128 ++ print("--hf-output-len is not provided. 
Using default value 128.") ++ fixed_output_len = 128 ++ ++ prompt_len = len(prompt_token_ids) ++ output_len = fixed_output_len ++ ++ assert isinstance( ++ data["image"], ++ Image), ("Input image format must be `PIL.Image.Image`, " ++ f"given {type(data['image'])}.") ++ image: Image = data["image"] ++ image = image.convert("RGB") ++ image_data = io.BytesIO() ++ image.save(image_data, format='JPEG') ++ image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") ++ mm_content = { ++ "type": "image_url", ++ "image_url": { ++ "url": f"data:image/jpeg;base64,{image_base64}" ++ }, ++ } ++ ++ sampled_requests.append((prompt, prompt_len, output_len, mm_content)) ++ ++ return sampled_requests ++ ++ ++def sample_hf_requests( ++ dataset_path: str, ++ dataset_subset: str, ++ dataset_split: str, ++ num_requests: int, ++ tokenizer: PreTrainedTokenizerBase, ++ random_seed: int, ++ fixed_output_len: Optional[int] = None, ++) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: ++ ++ # Special case for MMMU-Pro vision dataset ++ if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision': ++ assert dataset_split == "test" ++ dataset = load_dataset(dataset_path, ++ name=dataset_subset, ++ split=dataset_split, ++ streaming=True) ++ assert "image" in dataset.features, ( ++ "MMMU/MMMU_Pro vision dataset must have 'image' column.") ++ filter_func = lambda x: isinstance(x["image"], Image) ++ dataset = dataset.shuffle(seed=random_seed).filter(filter_func) ++ return sample_mmmu_pro_vision_requests(dataset, num_requests, ++ tokenizer, fixed_output_len) ++ ++ dataset = load_dataset(dataset_path, ++ name=dataset_subset, ++ split=dataset_split, ++ streaming=True) ++ assert "conversations" in dataset.features, ( ++ "HF Dataset must have 'conversations' column.") ++ filter_func = lambda x: len(x["conversations"]) >= 2 ++ filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func) ++ sampled_requests: List[Tuple[str, int, int, Dict[str, ++ Collection[str]]]] = [] ++ for data in filtered_dataset: ++ if len(sampled_requests) == num_requests: ++ break ++ ++ # Tokenize the prompts and completions. ++ prompt = data["conversations"][0]["value"] ++ prompt_token_ids = tokenizer(prompt).input_ids ++ completion = data["conversations"][1]["value"] ++ completion_token_ids = tokenizer(completion).input_ids ++ prompt_len = len(prompt_token_ids) ++ output_len = len(completion_token_ids ++ ) if fixed_output_len is None else fixed_output_len ++ if fixed_output_len is None and (prompt_len < 4 or output_len < 4): ++ # Prune too short sequences. ++ continue ++ if fixed_output_len is None and \ ++ (prompt_len > 1024 or prompt_len + output_len > 2048): ++ # Prune too long sequences. 
++ continue ++ ++ if "image" in data and isinstance(data["image"], Image): ++ image: Image = data["image"] ++ image = image.convert("RGB") ++ image_data = io.BytesIO() ++ image.save(image_data, format='JPEG') ++ image_base64 = base64.b64encode( ++ image_data.getvalue()).decode("utf-8") ++ mm_content = { ++ "type": "image_url", ++ "image_url": { ++ "url": f"data:image/jpeg;base64,{image_base64}" ++ }, ++ } ++ elif "image" in data and isinstance(data["image"], str): ++ if (data["image"].startswith("http://") or \ ++ data["image"].startswith("file://")): ++ image_url = data["image"] ++ else: ++ image_url = f"file://{data['image']}" ++ ++ mm_content = { ++ "type": "image_url", ++ "image_url": { ++ "url": image_url ++ }, ++ } ++ else: ++ mm_content = None ++ ++ sampled_requests.append((prompt, prompt_len, output_len, mm_content)) + + return sampled_requests + + ++def sample_random_requests( ++ prefix_len: int, ++ input_len: int, ++ output_len: int, ++ num_prompts: int, ++ range_ratio: float, ++ tokenizer: PreTrainedTokenizerBase, ++) -> List[Tuple[str, int, int]]: ++ prefix_token_ids = np.random.randint(0, ++ tokenizer.vocab_size, ++ size=prefix_len).tolist() ++ ++ input_lens = np.random.randint( ++ int(input_len * range_ratio), ++ input_len + 1, ++ size=num_prompts, ++ ) ++ output_lens = np.random.randint( ++ int(output_len * range_ratio), ++ output_len + 1, ++ size=num_prompts, ++ ) ++ offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) ++ input_requests = [] ++ for i in range(num_prompts): ++ prompt = tokenizer.decode(prefix_token_ids + ++ [(offsets[i] + i + j) % tokenizer.vocab_size ++ for j in range(input_lens[i])]) ++ ++ input_requests.append((prompt, int(prefix_len + input_lens[i]), ++ int(output_lens[i]), None)) ++ ++ return input_requests ++ ++ + async def get_request( + input_requests: List[Tuple[str, int, int]], + request_rate: float, ++ burstiness: float = 1.0, + ) -> AsyncGenerator[Tuple[str, int, int], None]: ++ """ ++ Asynchronously generates requests at a specified rate ++ with OPTIONAL burstiness. ++ ++ Args: ++ input_requests: ++ A list of input requests, each represented as a tuple. ++ request_rate: ++ The rate at which requests are generated (requests/s). ++ burstiness (optional): ++ The burstiness factor of the request generation. ++ Only takes effect when request_rate is not inf. ++ Default value is 1, which follows a Poisson process. ++ Otherwise, the request intervals follow a gamma distribution. ++ A lower burstiness value (0 < burstiness < 1) results ++ in more bursty requests, while a higher burstiness value ++ (burstiness > 1) results in a more uniform arrival of requests. ++ """ + input_requests = iter(input_requests) ++ ++ # Calculate scale parameter theta to maintain the desired request_rate. ++ assert burstiness > 0, ( ++ f"A positive burstiness factor is expected, but given {burstiness}.") ++ theta = 1.0 / (request_rate * burstiness) ++ + for request in input_requests: + yield request + + if request_rate == float("inf"): + # If the request rate is infinity, then we don't need to wait. + continue +- # Sample the request interval from the exponential distribution. +- interval = np.random.exponential(1.0 / request_rate) ++ ++ # Sample the request interval from the gamma distribution. ++ # If burstiness is 1, it follows exponential distribution. ++ interval = np.random.gamma(shape=burstiness, scale=theta) + # The next request will be sent after the interval. 
+ await asyncio.sleep(interval) + +@@ -192,39 +421,100 @@ def calculate_metrics( + outputs: List[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, ++ selected_percentile_metrics: List[str], ++ selected_percentiles: List[float], ++ gootput_config_dict: Dict[str, float], + ) -> Tuple[BenchmarkMetrics, List[int]]: +- actual_output_lens = [] ++ actual_output_lens: List[int] = [] + total_input = 0 + completed = 0 +- tpots = [] +- ttfts = [] ++ good_completed = 0 ++ itls: List[float] = [] ++ tpots: List[float] = [] ++ all_tpots: List[float] = [] ++ ttfts: List[float] = [] ++ e2els: List[float] = [] + for i in range(len(outputs)): + if outputs[i].success: +- output_len = len(tokenizer(outputs[i].generated_text).input_ids) ++ # We use the tokenizer to count the number of output tokens for all ++ # serving backends instead of looking at len(outputs[i].itl) since ++ # multiple output tokens may be bundled together ++ # Note : this may inflate the output token count slightly ++ output_len = len( ++ tokenizer(outputs[i].generated_text, ++ add_special_tokens=False).input_ids) + actual_output_lens.append(output_len) + total_input += input_requests[i][1] ++ tpot = 0 + if output_len > 1: +- tpots.append( +- (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) ++ tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - ++ 1) ++ tpots.append(tpot) ++ # Note: if output_len <= 1, we regard tpot as 0 for goodput ++ all_tpots.append(tpot) ++ itls += outputs[i].itl + ttfts.append(outputs[i].ttft) ++ e2els.append(outputs[i].latency) + completed += 1 + else: + actual_output_lens.append(0) + ++ if gootput_config_dict: ++ valid_metrics = [] ++ slo_values = [] ++ ++ if "ttft" in gootput_config_dict: ++ valid_metrics.append(ttfts) ++ slo_values.append(gootput_config_dict["ttft"] / ++ MILLISECONDS_TO_SECONDS_CONVERSION) ++ if "tpot" in gootput_config_dict: ++ valid_metrics.append(all_tpots) ++ slo_values.append(gootput_config_dict["tpot"] / ++ MILLISECONDS_TO_SECONDS_CONVERSION) ++ if "e2el" in gootput_config_dict: ++ valid_metrics.append(e2els) ++ slo_values.append(gootput_config_dict["e2el"] / ++ MILLISECONDS_TO_SECONDS_CONVERSION) ++ ++ for req_metric in zip(*valid_metrics): ++ is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) ++ if is_good_req: ++ good_completed += 1 ++ ++ if completed == 0: ++ warnings.warn( ++ "All requests failed. 
This is likely due to a misconfiguration " ++ "on the benchmark arguments.", ++ stacklevel=2) + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, +- input_throughput=total_input / dur_s, ++ request_goodput=good_completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, ++ total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) * + 1000, # ttfts is empty if streaming is not supported by backend ++ std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, +- p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000, +- mean_tpot_ms=np.mean(tpots) * 1000, +- median_tpot_ms=np.median(tpots) * 1000, +- p99_tpot_ms=np.percentile(tpots, 99) * 1000, ++ percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) ++ for p in selected_percentiles], ++ mean_tpot_ms=np.mean(tpots or 0) * 1000, ++ std_tpot_ms=np.std(tpots or 0) * 1000, ++ median_tpot_ms=np.median(tpots or 0) * 1000, ++ percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) ++ for p in selected_percentiles], ++ mean_itl_ms=np.mean(itls or 0) * 1000, ++ std_itl_ms=np.std(itls or 0) * 1000, ++ median_itl_ms=np.median(itls or 0) * 1000, ++ percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) ++ for p in selected_percentiles], ++ mean_e2el_ms=np.mean(e2els or 0) * 1000, ++ std_e2el_ms=np.std(e2els or 0) * 1000, ++ median_e2el_ms=np.median(e2els or 0) * 1000, ++ percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) ++ for p in selected_percentiles], + ) + + return metrics, actual_output_lens +@@ -233,43 +523,129 @@ def calculate_metrics( + async def benchmark( + backend: str, + api_url: str, ++ base_url: str, + model_id: str, + tokenizer: PreTrainedTokenizerBase, + input_requests: List[Tuple[str, int, int]], ++ logprobs: Optional[int], + best_of: int, +- use_beam_search: bool, + request_rate: float, ++ burstiness: float, + disable_tqdm: bool, ++ profile: bool, ++ selected_percentile_metrics: List[str], ++ selected_percentiles: List[str], ++ ignore_eos: bool, ++ gootput_config_dict: Dict[str, float], ++ max_concurrency: Optional[int], + ): + if backend in ASYNC_REQUEST_FUNCS: +- request_func = ASYNC_REQUEST_FUNCS.get(backend) ++ request_func = ASYNC_REQUEST_FUNCS[backend] + else: + raise ValueError(f"Unknown backend: {backend}") + ++ print("Starting initial single prompt test run...") ++ test_prompt, test_prompt_len, test_output_len, test_mm_content = ( ++ input_requests[0]) ++ if backend != "openai-chat" and test_mm_content is not None: ++ # multi-modal benchmark is only available on OpenAI Chat backend. ++ raise ValueError( ++ "Multi-modal content is only supported on 'openai-chat' backend.") ++ test_input = RequestFuncInput( ++ model=model_id, ++ prompt=test_prompt, ++ api_url=api_url, ++ prompt_len=test_prompt_len, ++ output_len=test_output_len, ++ logprobs=logprobs, ++ best_of=best_of, ++ multi_modal_content=test_mm_content, ++ ignore_eos=ignore_eos, ++ ) ++ test_output = await request_func(request_func_input=test_input) ++ if not test_output.success: ++ raise ValueError( ++ "Initial test run failed - Please make sure benchmark arguments " ++ f"are correctly specified. Error: {test_output.error}") ++ else: ++ print("Initial test run completed. 
Starting main benchmark run...") ++ ++ if profile: ++ print("Starting profiler...") ++ profile_input = RequestFuncInput(model=model_id, ++ prompt=test_prompt, ++ api_url=base_url + "/start_profile", ++ prompt_len=test_prompt_len, ++ output_len=test_output_len, ++ logprobs=logprobs, ++ best_of=best_of, ++ multi_modal_content=test_mm_content, ++ ignore_eos=ignore_eos) ++ profile_output = await request_func(request_func_input=profile_input) ++ if profile_output.success: ++ print("Profiler started") ++ ++ if burstiness == 1.0: ++ distribution = "Poisson process" ++ else: ++ distribution = "Gamma distribution" ++ + print(f"Traffic request rate: {request_rate}") ++ print(f"Burstiness factor: {burstiness} ({distribution})") ++ print(f"Maximum request concurrency: {max_concurrency}") + + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + ++ # This can be used once the minimum Python version is 3.10 or higher, ++ # and it will simplify the code in limited_request_func. ++ # semaphore = (asyncio.Semaphore(max_concurrency) ++ # if max_concurrency else contextlib.nullcontext()) ++ semaphore = (asyncio.Semaphore(max_concurrency) ++ if max_concurrency else None) ++ ++ async def limited_request_func(request_func_input, pbar): ++ if semaphore is None: ++ return await request_func(request_func_input=request_func_input, ++ pbar=pbar) ++ async with semaphore: ++ return await request_func(request_func_input=request_func_input, ++ pbar=pbar) ++ + benchmark_start_time = time.perf_counter() +- tasks = [] +- async for request in get_request(input_requests, request_rate): +- prompt, prompt_len, output_len = request +- request_func_input = RequestFuncInput( +- model=model_id, +- prompt=prompt, +- api_url=api_url, +- prompt_len=prompt_len, +- output_len=output_len, +- best_of=best_of, +- use_beam_search=use_beam_search, +- ) ++ tasks: List[asyncio.Task] = [] ++ async for request in get_request(input_requests, request_rate, burstiness): ++ prompt, prompt_len, output_len, mm_content = request ++ request_func_input = RequestFuncInput(model=model_id, ++ prompt=prompt, ++ api_url=api_url, ++ prompt_len=prompt_len, ++ output_len=output_len, ++ logprobs=logprobs, ++ best_of=best_of, ++ multi_modal_content=mm_content, ++ ignore_eos=ignore_eos) + tasks.append( + asyncio.create_task( +- request_func(request_func_input=request_func_input, +- pbar=pbar))) ++ limited_request_func(request_func_input=request_func_input, ++ pbar=pbar))) + outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + +- if not disable_tqdm: ++ if profile: ++ print("Stopping profiler...") ++ profile_input = RequestFuncInput( ++ model=model_id, ++ prompt=test_prompt, ++ api_url=base_url + "/stop_profile", ++ prompt_len=test_prompt_len, ++ output_len=test_output_len, ++ logprobs=logprobs, ++ best_of=best_of, ++ ) ++ profile_output = await request_func(request_func_input=profile_input) ++ if profile_output.success: ++ print("Profiler stopped") ++ ++ if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time +@@ -279,6 +655,9 @@ async def benchmark( + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, ++ selected_percentile_metrics=selected_percentile_metrics, ++ selected_percentiles=selected_percentiles, ++ gootput_config_dict=gootput_config_dict, + ) + + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) +@@ -290,23 +669,13 @@ async def benchmark( + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + 
metrics.request_throughput)) +- print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):", +- metrics.input_throughput)) ++ if gootput_config_dict: ++ print("{:<40} {:<10.2f}".format("Request goodput (req/s):", ++ metrics.request_goodput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) +- print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-')) +- print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms)) +- print("{:<40} {:<10.2f}".format("Median TTFT (ms):", +- metrics.median_ttft_ms)) +- print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms)) +- print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)', +- n=50, +- c='-')) +- print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms)) +- print("{:<40} {:<10.2f}".format("Median TPOT (ms):", +- metrics.median_tpot_ms)) +- print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms)) +- print("=" * 50) ++ print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", ++ metrics.total_token_throughput)) + + result = { + "duration": benchmark_duration, +@@ -314,14 +683,10 @@ async def benchmark( + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, +- "input_throughput": metrics.input_throughput, ++ "request_goodput:": ++ metrics.request_goodput if gootput_config_dict else None, + "output_throughput": metrics.output_throughput, +- "mean_ttft_ms": metrics.mean_ttft_ms, +- "median_ttft_ms": metrics.median_ttft_ms, +- "p99_ttft_ms": metrics.p99_ttft_ms, +- "mean_tpot_ms": metrics.mean_tpot_ms, +- "median_tpot_ms": metrics.median_tpot_ms, +- "p99_tpot_ms": metrics.p99_tpot_ms, ++ "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], +@@ -329,9 +694,85 @@ async def benchmark( + "generated_texts": [output.generated_text for output in outputs], + "errors": [output.error for output in outputs], + } ++ ++ def process_one_metric( ++ # E.g., "ttft" ++ metric_attribute_name: str, ++ # E.g., "TTFT" ++ metric_name: str, ++ # E.g., "Time to First Token" ++ metric_header: str, ++ ): ++ # This function prints and adds statistics of the specified ++ # metric. ++ if metric_attribute_name not in selected_percentile_metrics: ++ return ++ print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) ++ print("{:<40} {:<10.2f}".format( ++ f"Mean {metric_name} (ms):", ++ getattr(metrics, f"mean_{metric_attribute_name}_ms"))) ++ print("{:<40} {:<10.2f}".format( ++ f"Median {metric_name} (ms):", ++ getattr(metrics, f"median_{metric_attribute_name}_ms"))) ++ result[f"mean_{metric_attribute_name}_ms"] = getattr( ++ metrics, f"mean_{metric_attribute_name}_ms") ++ result[f"median_{metric_attribute_name}_ms"] = getattr( ++ metrics, f"median_{metric_attribute_name}_ms") ++ result[f"std_{metric_attribute_name}_ms"] = getattr( ++ metrics, f"std_{metric_attribute_name}_ms") ++ for p, value in getattr(metrics, ++ f"percentiles_{metric_attribute_name}_ms"): ++ p_word = str(int(p)) if int(p) == p else str(p) ++ print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", ++ value)) ++ result[f"p{p_word}_{metric_attribute_name}_ms"] = value ++ ++ process_one_metric("ttft", "TTFT", "Time to First Token") ++ process_one_metric("tpot", "TPOT", ++ "Time per Output Token (excl. 
1st token)") ++ process_one_metric("itl", "ITL", "Inter-token Latency") ++ process_one_metric("e2el", "E2EL", "End-to-end Latency") ++ ++ print("=" * 50) ++ + return result + + ++def check_goodput_args(args): ++ # Check and parse goodput arguments ++ gootput_config_dict = {} ++ VALID_NAMES = ["ttft", "tpot", "e2el"] ++ if args.goodput: ++ gootput_config_dict = parse_goodput(args.goodput) ++ for slo_name, slo_val in gootput_config_dict.items(): ++ if slo_name not in VALID_NAMES: ++ raise ValueError( ++ f"Invalid metric name found, {slo_name}: {slo_val}. " ++ "The service level objective name should be one of " ++ f"{str(VALID_NAMES)}. ") ++ if slo_val < 0: ++ raise ValueError( ++ f"Invalid value found, {slo_name}: {slo_val}. " ++ "The service level objective value should be " ++ "non-negative.") ++ return gootput_config_dict ++ ++ ++def parse_goodput(slo_pairs): ++ gootput_config_dict = {} ++ try: ++ for slo_pair in slo_pairs: ++ slo_name, slo_val = slo_pair.split(":") ++ gootput_config_dict[slo_name] = float(slo_val) ++ except ValueError as err: ++ raise argparse.ArgumentTypeError( ++ "Invalid format found for service level objectives. " ++ "Specify service level objectives for goodput as \"KEY:VALUE\" " ++ "pairs, where the key is a metric name, and the value is a " ++ "number in milliseconds.") from err ++ return gootput_config_dict ++ ++ + def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) +@@ -340,13 +781,17 @@ def main(args: argparse.Namespace): + backend = args.backend + model_id = args.model + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model ++ tokenizer_mode = args.tokenizer_mode + + if args.base_url is not None: + api_url = f"{args.base_url}{args.endpoint}" ++ base_url = f"{args.base_url}" + else: + api_url = f"http://{args.host}:{args.port}{args.endpoint}" ++ base_url = f"http://{args.host}:{args.port}" + + tokenizer = get_tokenizer(tokenizer_id, ++ tokenizer_mode=tokenizer_mode, + trust_remote_code=args.trust_remote_code) + + if args.dataset is not None: +@@ -381,9 +826,9 @@ def main(args: argparse.Namespace): + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + ) +- input_requests = [(prompt, prompt_len, output_len) ++ input_requests = [(prompt, prompt_len, output_len, None) + for prompt, prompt_formatted, prompt_len, +- output_len in input_requests] ++ output_len, _ in input_requests] + else: + assert ( + tokenizer.chat_template or tokenizer.default_chat_template +@@ -396,29 +841,62 @@ def main(args: argparse.Namespace): + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + ) +- input_requests = [(prompt_formatted, prompt_len, output_len) ++ input_requests = [(prompt_formatted, prompt_len, output_len, None) + for prompt, prompt_formatted, prompt_len, +- output_len in input_requests] ++ output_len, _ in input_requests] ++ ++ elif args.dataset_name == "hf": ++ input_requests = sample_hf_requests( ++ dataset_path=args.dataset_path, ++ dataset_subset=args.hf_subset, ++ dataset_split=args.hf_split, ++ num_requests=args.num_prompts, ++ tokenizer=tokenizer, ++ random_seed=args.seed, ++ fixed_output_len=args.hf_output_len, ++ ) ++ ++ elif args.dataset_name == "random": ++ input_requests = sample_random_requests( ++ prefix_len=args.random_prefix_len, ++ input_len=args.random_input_len, ++ output_len=args.random_output_len, ++ num_prompts=args.num_prompts, ++ range_ratio=args.random_range_ratio, ++ tokenizer=tokenizer, ++ ) + + else: + raise ValueError(f"Unknown dataset: {args.dataset_name}") + ++ gootput_config_dict = 
check_goodput_args(args) ++ + benchmark_result = asyncio.run( + benchmark( + backend=backend, + api_url=api_url, ++ base_url=base_url, + model_id=model_id, + tokenizer=tokenizer, + input_requests=input_requests, ++ logprobs=args.logprobs, + best_of=args.best_of, +- use_beam_search=args.use_beam_search, + request_rate=args.request_rate, ++ burstiness=args.burstiness, + disable_tqdm=args.disable_tqdm, ++ profile=args.profile, ++ selected_percentile_metrics=args.percentile_metrics.split(","), ++ selected_percentiles=[ ++ float(p) for p in args.metric_percentiles.split(",") ++ ], ++ ignore_eos=args.ignore_eos, ++ gootput_config_dict=gootput_config_dict, ++ max_concurrency=args.max_concurrency, + )) + + # Save config and results to json + if args.save_result: +- result_json = {} ++ result_json: Dict[str, Any] = {} + + # Setup + current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") +@@ -427,7 +905,6 @@ def main(args: argparse.Namespace): + result_json["model_id"] = model_id + result_json["tokenizer_id"] = tokenizer_id + result_json["best_of"] = args.best_of +- result_json["use_beam_search"] = args.use_beam_search + result_json["num_prompts"] = args.num_prompts + + # Metadata +@@ -444,21 +921,27 @@ def main(args: argparse.Namespace): + # Traffic + result_json["request_rate"] = ( + args.request_rate if args.request_rate < float("inf") else "inf") ++ result_json["burstiness"] = args.burstiness ++ result_json["max_concurrency"] = args.max_concurrency + + # Merge with benchmark result + result_json = {**result_json, **benchmark_result} + + # Save to file + base_model_id = model_id.split("/")[-1] +- file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa ++ max_concurrency_str = (f"-concurrency{args.max_concurrency}" ++ if args.max_concurrency is not None else "") ++ file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa ++ if args.result_filename: ++ file_name = args.result_filename + if args.result_dir: + file_name = os.path.join(args.result_dir, file_name) +- with open(file_name, "w") as outfile: ++ with open(file_name, "w", encoding='utf-8') as outfile: + json.dump(result_json, outfile) + + + if __name__ == "__main__": +- parser = argparse.ArgumentParser( ++ parser = FlexibleArgumentParser( + description="Benchmark the online serving throughput.") + parser.add_argument( + "--backend", +@@ -491,13 +974,27 @@ if __name__ == "__main__": + "--dataset-name", + type=str, + default="sharegpt", +- choices=["sharegpt", "sonnet"], ++ choices=["sharegpt", "sonnet", "random", "hf"], + help="Name of the dataset to benchmark on.", + ) + parser.add_argument("--dataset-path", + type=str, + default=None, +- help="Path to the dataset.") ++ help="Path to the sharegpt/sonnet dataset. " ++ "Or the huggingface dataset ID if using HF dataset.") ++ parser.add_argument( ++ "--max-concurrency", ++ type=int, ++ default=None, ++ help="Maximum number of concurrent requests. This can be used " ++ "to help simulate an environment where a higher level component " ++ "is enforcing a maximum number of concurrent requests. While the " ++ "--request-rate argument controls the rate at which requests are " ++ "initiated, this argument will control how many are actually allowed " ++ "to execute at a time. 
This means that when used in combination, the " ++ "actual request rate may be lower than specified with --request-rate, " ++ "if the server is not processing requests fast enough to keep up.") ++ + parser.add_argument( + "--model", + type=str, +@@ -508,7 +1005,7 @@ if __name__ == "__main__": + "--tokenizer", + type=str, + help= +- "Name or path of the tokenizer, if not using the default tokenizer.", ++ "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument( + "--best-of", +@@ -525,31 +1022,14 @@ if __name__ == "__main__": + help="Number of prompts to process.", + ) + parser.add_argument( +- "--sharegpt-output-len", ++ "--logprobs", + type=int, + default=None, +- help="Output length for each request. Overrides the output length " +- "from the ShareGPT dataset.") +- parser.add_argument( +- "--sonnet-input-len", +- type=int, +- default=550, +- help= +- "Number of input tokens per request, used only for sonnet dataset.", +- ) +- parser.add_argument( +- "--sonnet-output-len", +- type=int, +- default=150, +- help= +- "Number of output tokens per request, used only for sonnet dataset.", +- ) +- parser.add_argument( +- "--sonnet-prefix-len", +- type=int, +- default=200, +- help= +- "Number of prefix tokens per request, used only for sonnet dataset.", ++ help=("Number of logprobs-per-token to compute & return as part of " ++ "the request. If unspecified, then either (1) if beam search " ++ "is disabled, no logprobs are computed & a single dummy " ++ "logprob is returned for each token; or (2) if beam search " ++ "is enabled 1 logprob per token is computed"), + ) + parser.add_argument( + "--request-rate", +@@ -557,8 +1037,20 @@ if __name__ == "__main__": + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " +- "Otherwise, we use Poisson process to synthesize " +- "the request arrival times.", ++ "Otherwise, we use Poisson process or gamma distribution " ++ "to synthesize the request arrival times.", ++ ) ++ parser.add_argument( ++ "--burstiness", ++ type=float, ++ default=1.0, ++ help="Burstiness factor of the request generation. " ++ "Only take effect when request_rate is not inf. " ++ "Default value is 1, which follows Poisson process. " ++ "Otherwise, the request intervals follow a gamma distribution. " ++ "A lower burstiness value (0 < burstiness < 1) results in more " ++ "bursty requests. A higher burstiness value (burstiness > 1) " ++ "results in a more uniform arrival of requests.", + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( +@@ -571,6 +1063,12 @@ if __name__ == "__main__": + action="store_true", + help="Specify to disable tqdm progress bar.", + ) ++ parser.add_argument( ++ "--profile", ++ action="store_true", ++ help="Use Torch Profiler. The endpoint must be launched with " ++ "VLLM_TORCH_PROFILER_DIR to enable profiler.", ++ ) + parser.add_argument( + "--save-result", + action="store_true", +@@ -591,6 +1089,138 @@ if __name__ == "__main__": + help="Specify directory to save benchmark json results." + "If not specified, results are saved in the current directory.", + ) ++ parser.add_argument( ++ "--result-filename", ++ type=str, ++ default=None, ++ help="Specify the filename to save benchmark json results." 
++ "If not specified, results will be saved in " ++ "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" ++ " format.", ++ ) ++ parser.add_argument( ++ "--ignore-eos", ++ action="store_true", ++ help="Set ignore_eos flag when sending the benchmark request." ++ "Warning: ignore_eos is not supported in deepspeed_mii and tgi.") ++ parser.add_argument( ++ "--percentile-metrics", ++ type=str, ++ default="ttft,tpot,itl", ++ help="Comma-seperated list of selected metrics to report percentils. " ++ "This argument specifies the metrics to report percentiles. " ++ "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " ++ "Default value is \"ttft,tpot,itl\".") ++ parser.add_argument( ++ "--metric-percentiles", ++ type=str, ++ default="99", ++ help="Comma-seperated list of percentiles for selected metrics. " ++ "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " ++ "Default value is \"99\". " ++ "Use \"--percentile-metrics\" to select metrics.", ++ ) ++ parser.add_argument( ++ "--goodput", ++ nargs="+", ++ required=False, ++ help="Specify service level objectives for goodput as \"KEY:VALUE\" " ++ "pairs, where the key is a metric name, and the value is in " ++ "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, " ++ "separated by spaces. Allowed request level metric names are " ++ "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of " ++ "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " ++ "and the blog: https://hao-ai-lab.github.io/blogs/distserve") ++ ++ # group for dataset specific arguments ++ sonnet_group = parser.add_argument_group("sonnet dataset options") ++ sonnet_group.add_argument( ++ "--sonnet-input-len", ++ type=int, ++ default=550, ++ help= ++ "Number of input tokens per request, used only for sonnet dataset.", ++ ) ++ sonnet_group.add_argument( ++ "--sonnet-output-len", ++ type=int, ++ default=150, ++ help= ++ "Number of output tokens per request, used only for sonnet dataset.", ++ ) ++ sonnet_group.add_argument( ++ "--sonnet-prefix-len", ++ type=int, ++ default=200, ++ help= ++ "Number of prefix tokens per request, used only for sonnet dataset.", ++ ) ++ ++ sharegpt_group = parser.add_argument_group("sharegpt dataset options") ++ sharegpt_group.add_argument( ++ "--sharegpt-output-len", ++ type=int, ++ default=None, ++ help="Output length for each request. Overrides the output length " ++ "from the ShareGPT dataset.") ++ ++ random_group = parser.add_argument_group("random dataset options") ++ random_group.add_argument( ++ "--random-input-len", ++ type=int, ++ default=1024, ++ help= ++ "Number of input tokens per request, used only for random sampling.", ++ ) ++ random_group.add_argument( ++ "--random-output-len", ++ type=int, ++ default=128, ++ help= ++ "Number of output tokens per request, used only for random sampling.", ++ ) ++ random_group.add_argument( ++ "--random-range-ratio", ++ type=float, ++ default=1.0, ++ help="Range of sampled ratio of input/output length, " ++ "used only for random sampling.", ++ ) ++ random_group.add_argument( ++ "--random-prefix-len", ++ type=int, ++ default=0, ++ help="Number of fixed prefix tokens before random " ++ " context. 
The length range of context in a random " ++ " request is [random-prefix-len, " ++ " random-prefix-len + random-prefix-len * random-range-ratio).") ++ ++ hf_group = parser.add_argument_group("hf dataset options") ++ hf_group.add_argument("--hf-subset", ++ type=str, ++ default=None, ++ help="Subset of the HF dataset.") ++ hf_group.add_argument("--hf-split", ++ type=str, ++ default=None, ++ help="Split of the HF dataset.") ++ hf_group.add_argument( ++ "--hf-output-len", ++ type=int, ++ default=None, ++ help="Output length for each request. Overrides the output lengths " ++ "from the sampled HF dataset.", ++ ) ++ ++ parser.add_argument( ++ '--tokenizer-mode', ++ type=str, ++ default="auto", ++ choices=['auto', 'slow', 'mistral'], ++ help='The tokenizer mode.\n\n* "auto" will use the ' ++ 'fast tokenizer if available.\n* "slow" will ' ++ 'always use the slow tokenizer. \n* ' ++ '"mistral" will always use the `mistral_common` tokenizer.') + + args = parser.parse_args() + main(args) +diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py +new file mode 100644 +index 0000000..4435d87 +--- /dev/null ++++ b/benchmarks/benchmark_serving_guided.py +@@ -0,0 +1,881 @@ ++r"""Benchmark online serving throughput with guided decoding. ++ ++On the server side, run one of the following commands: ++ (vLLM OpenAI API server) ++ vllm serve --disable-log-requests ++ ++ (TGI backend) ++ ./launch_tgi_server.sh ++ ++On the client side, run: ++ python benchmarks/benchmark_serving.py \ ++ --backend \ ++ --model \ ++ --dataset json \ ++ --guided-decoding-ratio 1.0 \ ++ --guided-decoding-backend xgrammar \ ++ --request-rate 10 \ ++ --num-prompts 1000 ++ ++ when using tgi backend, add ++ --endpoint /generate_stream ++ to the end of the command above. ++""" ++import argparse ++import asyncio ++import dataclasses ++import json ++import os ++import random ++import time ++import warnings ++from dataclasses import dataclass ++from typing import AsyncGenerator, List, Optional, Tuple ++ ++import datasets ++import numpy as np ++import pandas as pd ++from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, ++ RequestFuncOutput) ++from tqdm.asyncio import tqdm ++from transformers import PreTrainedTokenizerBase ++ ++try: ++ from vllm.transformers_utils.tokenizer import get_tokenizer ++except ImportError: ++ from backend_request_func import get_tokenizer ++ ++try: ++ from vllm.utils import FlexibleArgumentParser ++except ImportError: ++ from argparse import ArgumentParser as FlexibleArgumentParser ++ ++MILLISECONDS_TO_SECONDS_CONVERSION = 1000 ++ ++ ++@dataclass ++class BenchmarkMetrics: ++ completed: int ++ total_input: int ++ total_output: int ++ request_throughput: float ++ request_goodput: float ++ output_throughput: float ++ total_token_throughput: float ++ mean_ttft_ms: float ++ median_ttft_ms: float ++ std_ttft_ms: float ++ percentiles_ttft_ms: List[Tuple[float, float]] ++ mean_tpot_ms: float ++ median_tpot_ms: float ++ std_tpot_ms: float ++ percentiles_tpot_ms: List[Tuple[float, float]] ++ mean_itl_ms: float ++ median_itl_ms: float ++ std_itl_ms: float ++ percentiles_itl_ms: List[Tuple[float, float]] ++ # E2EL stands for end-to-end latency per request. ++ # It is the time taken on the client side from sending ++ # a request to receiving a complete response. 
++ mean_e2el_ms: float ++ median_e2el_ms: float ++ std_e2el_ms: float ++ percentiles_e2el_ms: List[Tuple[float, float]] ++ ++ ++@dataclasses.dataclass ++class SampleRequest: ++ """A class representing a single inference request for benchmarking. ++ ++ Attributes: ++ prompt: The input text prompt for the model. ++ multi_modal_data: Optional dictionary containing multi-modal data (e.g. ++ images). ++ prompt_len: The length of the prompt in tokens. ++ expected_output_len: The expected length of the output in tokens. ++ """ ++ prompt: str ++ prompt_len: int ++ expected_output_len: int ++ schema: dict ++ structure_type: str ++ completion: str = None ++ ++ ++def sample_requests(tokenizer: PreTrainedTokenizerBase, ++ args: argparse.Namespace) -> List[SampleRequest]: ++ if args.dataset == 'json': ++ if args.json_schema_path is None: ++ dir_path = os.path.dirname(os.path.realpath(__file__)) ++ args.json_schema_path = os.path.join(dir_path, ++ "structured_schemas", ++ "structured_schema_1.json") ++ with open(args.json_schema_path) as f: ++ schema = json.load(f) ++ prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 ++ input_len = len(tokenizer(prompt).input_ids) ++ print(f"Input length of the prompt: {input_len} tokens") ++ requests = [ ++ SampleRequest(prompt=prompt, ++ prompt_len=input_len, ++ expected_output_len=args.output_len, ++ schema=schema, ++ structure_type=args.structure_type) ++ for _ in range(args.num_prompts) ++ ] ++ ++ elif args.dataset == "grammar": ++ schema = """ ++ ?start: select_statement ++ ++ ?select_statement: "SELECT " column_list " FROM " table_name ++ ++ ?column_list: column_name ("," column_name)* ++ ++ ?table_name: identifier ++ ++ ?column_name: identifier ++ ++ ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ ++ """ ++ prompt = "Generate an SQL query to show the 'username' \ ++ and 'email' from the 'users' table." ++ ++ input_len = len(tokenizer(prompt).input_ids) ++ print(f"Input length of the prompt: {input_len} tokens") ++ requests = [ ++ SampleRequest(prompt=prompt, ++ prompt_len=input_len, ++ expected_output_len=args.output_len, ++ schema=schema, ++ structure_type=args.structure_type) ++ for _ in range(args.num_prompts) ++ ] ++ ++ elif args.dataset == "regex": ++ regex = r"\w+@\w+\.com\n" ++ args.regex = regex ++ prompt = "Generate an email address for Alan Turing, \ ++ who works in Enigma. End in .com and new line. \ ++ Example result: alan.turing@enigma.com\n" ++ ++ input_len = len(tokenizer(prompt).input_ids) ++ print(f"Input length of the prompt: {input_len} tokens") ++ requests = [ ++ SampleRequest(prompt=prompt, ++ prompt_len=input_len, ++ expected_output_len=args.output_len, ++ schema=regex, ++ structure_type=args.structure_type) ++ for _ in range(args.num_prompts) ++ ] ++ ++ elif args.dataset == "choice": ++ choice = ["Positive", "Negative"] ++ args.choice = choice ++ prompt = "Classify this sentiment: vLLM is wonderful!" 
++ input_len = len(tokenizer(prompt).input_ids) ++ print(f"Input length of the prompt: {input_len} tokens") ++ requests = [ ++ SampleRequest(prompt=prompt, ++ prompt_len=input_len, ++ expected_output_len=args.output_len, ++ schema=choice, ++ structure_type=args.structure_type) ++ for _ in range(args.num_prompts) ++ ] ++ ++ elif args.dataset == "xgrammar_bench": ++ requests: List[SampleRequest] = [] ++ dataset = datasets.load_dataset("NousResearch/json-mode-eval", ++ split="train") ++ print(f"dataset has {len(dataset)} entries") ++ len_dataset = len(dataset) ++ for data_point_idx in range(args.num_prompts): ++ idx = data_point_idx ++ while idx >= len_dataset: ++ idx -= len_dataset ++ schema = dataset["schema"][idx] ++ prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], ++ tokenize=False) ++ input_len = len(tokenizer(prompt).input_ids) ++ completion = dataset["completion"][idx] ++ ++ requests.append( ++ SampleRequest(prompt=prompt, ++ prompt_len=input_len, ++ expected_output_len=args.output_len, ++ schema=schema, ++ structure_type=args.structure_type, ++ completion=completion)) ++ ++ return requests ++ ++ ++async def get_request( ++ input_requests: List[SampleRequest], ++ request_rate: float, ++ burstiness: float = 1.0, ++) -> AsyncGenerator[Tuple[int, SampleRequest], None]: ++ """ ++ Asynchronously generates requests at a specified rate ++ with OPTIONAL burstiness. ++ ++ Args: ++ input_requests: ++ A list of input requests, each represented as a tuple. ++ request_rate: ++ The rate at which requests are generated (requests/s). ++ burstiness (optional): ++ The burstiness factor of the request generation. ++ Only takes effect when request_rate is not inf. ++ Default value is 1, which follows a Poisson process. ++ Otherwise, the request intervals follow a gamma distribution. ++ A lower burstiness value (0 < burstiness < 1) results ++ in more bursty requests, while a higher burstiness value ++ (burstiness > 1) results in a more uniform arrival of requests. ++ """ ++ input_requests = iter(input_requests) ++ ++ # Calculate scale parameter theta to maintain the desired request_rate. ++ assert burstiness > 0, ( ++ f"A positive burstiness factor is expected, but given {burstiness}.") ++ theta = 1.0 / (request_rate * burstiness) ++ ++ for i, request in enumerate(input_requests): ++ yield i, request ++ ++ if request_rate == float("inf"): ++ # If the request rate is infinity, then we don't need to wait. ++ continue ++ ++ # Sample the request interval from the gamma distribution. ++ # If burstiness is 1, it follows exponential distribution. ++ interval = np.random.gamma(shape=burstiness, scale=theta) ++ # The next request will be sent after the interval. 
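The pacing model described in the docstring above can be sanity-checked in isolation. The sketch below uses hypothetical values and assumes only numpy; it confirms that the mean interval stays at 1/request_rate for any burstiness, which only changes the spread of the intervals.

import numpy as np

request_rate = 10.0  # requests/s, illustrative value
for burstiness in (0.5, 1.0, 2.0):
    theta = 1.0 / (request_rate * burstiness)
    # shape * scale = burstiness * theta = 1 / request_rate, independent of burstiness
    intervals = np.random.gamma(shape=burstiness, scale=theta, size=100_000)
    print(f"burstiness={burstiness}: mean={intervals.mean():.4f}s, "
          f"std={intervals.std():.4f}s")
# burstiness=1.0 reduces to the exponential (Poisson) case; smaller values give
# burstier arrivals, larger values a more uniform arrival pattern.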
++ await asyncio.sleep(interval) ++ ++ ++def calculate_metrics( ++ input_requests: List[Tuple[str, int, int]], ++ outputs: List[RequestFuncOutput], ++ dur_s: float, ++ tokenizer: PreTrainedTokenizerBase, ++ selected_percentile_metrics: List[str], ++ selected_percentiles: List[float], ++) -> Tuple[BenchmarkMetrics, List[int]]: ++ actual_output_lens: List[int] = [] ++ total_input = 0 ++ completed = 0 ++ good_completed = 0 ++ itls: List[float] = [] ++ tpots: List[float] = [] ++ all_tpots: List[float] = [] ++ ttfts: List[float] = [] ++ e2els: List[float] = [] ++ for i in range(len(outputs)): ++ if outputs[i].success: ++ # We use the tokenizer to count the number of output tokens for all ++ # serving backends instead of looking at len(outputs[i].itl) since ++ # multiple output tokens may be bundled together ++ # Note : this may inflate the output token count slightly ++ output_len = len( ++ tokenizer(outputs[i].generated_text, ++ add_special_tokens=False).input_ids) ++ actual_output_lens.append(output_len) ++ total_input += input_requests[i].prompt_len ++ tpot = 0 ++ if output_len > 1: ++ tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - ++ 1) ++ tpots.append(tpot) ++ outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0 ++ # Note: if output_len <= 1, we regard tpot as 0 for goodput ++ all_tpots.append(tpot) ++ itls += outputs[i].itl ++ ttfts.append(outputs[i].ttft) ++ e2els.append(outputs[i].latency) ++ completed += 1 ++ else: ++ actual_output_lens.append(0) ++ ++ if completed == 0: ++ warnings.warn( ++ "All requests failed. This is likely due to a misconfiguration " ++ "on the benchmark arguments.", ++ stacklevel=2) ++ metrics = BenchmarkMetrics( ++ completed=completed, ++ total_input=total_input, ++ total_output=sum(actual_output_lens), ++ request_throughput=completed / dur_s, ++ request_goodput=good_completed / dur_s, ++ output_throughput=sum(actual_output_lens) / dur_s, ++ total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, ++ mean_ttft_ms=np.mean(ttfts or 0) * ++ 1000, # ttfts is empty if streaming is not supported by backend ++ std_ttft_ms=np.std(ttfts or 0) * 1000, ++ median_ttft_ms=np.median(ttfts or 0) * 1000, ++ percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) ++ for p in selected_percentiles], ++ mean_tpot_ms=np.mean(tpots or 0) * 1000, ++ std_tpot_ms=np.std(tpots or 0) * 1000, ++ median_tpot_ms=np.median(tpots or 0) * 1000, ++ percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) ++ for p in selected_percentiles], ++ mean_itl_ms=np.mean(itls or 0) * 1000, ++ std_itl_ms=np.std(itls or 0) * 1000, ++ median_itl_ms=np.median(itls or 0) * 1000, ++ percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) ++ for p in selected_percentiles], ++ mean_e2el_ms=np.mean(e2els or 0) * 1000, ++ std_e2el_ms=np.std(e2els or 0) * 1000, ++ median_e2el_ms=np.median(e2els or 0) * 1000, ++ percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) ++ for p in selected_percentiles], ++ ) ++ ++ return metrics, actual_output_lens ++ ++ ++async def benchmark( ++ backend: str, ++ api_url: str, ++ base_url: str, ++ model_id: str, ++ tokenizer: PreTrainedTokenizerBase, ++ input_requests: List[SampleRequest], ++ request_rate: float, ++ burstiness: float, ++ disable_tqdm: bool, ++ profile: bool, ++ selected_percentile_metrics: List[str], ++ selected_percentiles: List[str], ++ ignore_eos: bool, ++ max_concurrency: Optional[int], ++ guided_decoding_ratio: float, ++ guided_decoding_backend: str, ++): ++ if backend in ASYNC_REQUEST_FUNCS: ++ 
request_func = ASYNC_REQUEST_FUNCS[backend] ++ else: ++ raise ValueError(f"Unknown backend: {backend}") ++ ++ def prepare_extra_body(request) -> dict: ++ extra_body = {} ++ # Add the schema to the extra_body ++ extra_body[request.structure_type] = request.schema ++ # Add the specific guided_decoding_backend ++ extra_body["guided_decoding_backend"] = guided_decoding_backend ++ return extra_body ++ ++ print("Starting initial single prompt test run...") ++ guided_decoding_req_idx = random.sample( ++ range(len(input_requests)), ++ int(len(input_requests) * guided_decoding_ratio)) ++ ++ test_request = input_requests[0] ++ test_input = RequestFuncInput( ++ model=model_id, ++ prompt=test_request.prompt, ++ api_url=api_url, ++ prompt_len=test_request.prompt_len, ++ output_len=test_request.expected_output_len, ++ ignore_eos=ignore_eos, ++ extra_body=prepare_extra_body(test_request), ++ ) ++ test_output = await request_func(request_func_input=test_input) ++ if not test_output.success: ++ raise ValueError( ++ "Initial test run failed - Please make sure benchmark arguments " ++ f"are correctly specified. Error: {test_output.error}") ++ else: ++ print("Initial test run completed. Starting main benchmark run...") ++ ++ if profile: ++ print("Starting profiler...") ++ profile_input = RequestFuncInput( ++ model=model_id, ++ prompt=test_request.prompt, ++ api_url=base_url + "/start_profile", ++ prompt_len=test_request.prompt_len, ++ output_len=test_request.expected_output_len, ++ ignore_eos=ignore_eos, ++ extra_body=prepare_extra_body(test_request), ++ ) ++ profile_output = await request_func(request_func_input=profile_input) ++ if profile_output.success: ++ print("Profiler started") ++ ++ if burstiness == 1.0: ++ distribution = "Poisson process" ++ else: ++ distribution = "Gamma distribution" ++ ++ print(f"Traffic request rate: {request_rate}") ++ print(f"Burstiness factor: {burstiness} ({distribution})") ++ print(f"Maximum request concurrency: {max_concurrency}") ++ ++ pbar = None if disable_tqdm else tqdm(total=len(input_requests)) ++ ++ # This can be used once the minimum Python version is 3.10 or higher, ++ # and it will simplify the code in limited_request_func. 
++ # semaphore = (asyncio.Semaphore(max_concurrency) ++ # if max_concurrency else contextlib.nullcontext()) ++ semaphore = (asyncio.Semaphore(max_concurrency) ++ if max_concurrency else None) ++ ++ async def limited_request_func(request_func_input, pbar): ++ if semaphore is None: ++ return await request_func(request_func_input=request_func_input, ++ pbar=pbar) ++ async with semaphore: ++ return await request_func(request_func_input=request_func_input, ++ pbar=pbar) ++ ++ benchmark_start_time = time.perf_counter() ++ tasks: List[asyncio.Task] = [] ++ expected: List[str] = [] ++ async for i, request in get_request(input_requests, request_rate, ++ burstiness): ++ extra_body = prepare_extra_body( ++ request) if i in guided_decoding_req_idx else None ++ request_func_input = RequestFuncInput( ++ model=model_id, ++ prompt=request.prompt, ++ api_url=api_url, ++ prompt_len=request.prompt_len, ++ output_len=request.expected_output_len, ++ ignore_eos=ignore_eos, ++ extra_body=extra_body, ++ ) ++ expected.append(request.completion) ++ tasks.append( ++ asyncio.create_task( ++ limited_request_func(request_func_input=request_func_input, ++ pbar=pbar))) ++ outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) ++ ++ if profile: ++ print("Stopping profiler...") ++ profile_input = RequestFuncInput( ++ model=model_id, ++ prompt=test_request.prompt, ++ api_url=base_url + "/stop_profile", ++ prompt_len=test_request.prompt_len, ++ output_len=test_request.expected_output_len, ++ extra_body={test_request.structure_type: test_request.schema}, ++ ) ++ profile_output = await request_func(request_func_input=profile_input) ++ if profile_output.success: ++ print("Profiler stopped") ++ ++ if pbar is not None: ++ pbar.close() ++ ++ benchmark_duration = time.perf_counter() - benchmark_start_time ++ ++ metrics, actual_output_lens = calculate_metrics( ++ input_requests=input_requests, ++ outputs=outputs, ++ dur_s=benchmark_duration, ++ tokenizer=tokenizer, ++ selected_percentile_metrics=selected_percentile_metrics, ++ selected_percentiles=selected_percentiles, ++ ) ++ ++ print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) ++ print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) ++ print("{:<40} {:<10.2f}".format("Benchmark duration (s):", ++ benchmark_duration)) ++ print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) ++ print("{:<40} {:<10}".format("Total generated tokens:", ++ metrics.total_output)) ++ print("{:<40} {:<10.2f}".format("Request throughput (req/s):", ++ metrics.request_throughput)) ++ print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", ++ metrics.output_throughput)) ++ print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", ++ metrics.total_token_throughput)) ++ ++ result = { ++ "duration": ++ benchmark_duration, ++ "completed": ++ metrics.completed, ++ "total_input_tokens": ++ metrics.total_input, ++ "total_output_tokens": ++ metrics.total_output, ++ "request_throughput": ++ metrics.request_throughput, ++ "output_throughput": ++ metrics.output_throughput, ++ "total_token_throughput": ++ metrics.total_token_throughput, ++ "ttft_description": ++ pd.Series([output.ttft for output in outputs]).describe().to_dict(), ++ "tpot_description": ++ pd.Series([output.tpot for output in outputs]).describe().to_dict(), ++ "input_lens": [output.prompt_len for output in outputs], ++ "output_lens": ++ actual_output_lens, ++ "ttfts": [output.ttft for output in outputs], ++ "itls": [output.itl for output in outputs], ++ "errors": 
[output.error for output in outputs], ++ } ++ ++ ret = [{ ++ 'generated': output.generated_text, ++ 'expected': gt ++ } for output, gt in zip(outputs, expected)] ++ ++ def process_one_metric( ++ # E.g., "ttft" ++ metric_attribute_name: str, ++ # E.g., "TTFT" ++ metric_name: str, ++ # E.g., "Time to First Token" ++ metric_header: str, ++ ): ++ # This function prints and adds statistics of the specified ++ # metric. ++ if metric_attribute_name not in selected_percentile_metrics: ++ return ++ print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) ++ print("{:<40} {:<10.2f}".format( ++ f"Mean {metric_name} (ms):", ++ getattr(metrics, f"mean_{metric_attribute_name}_ms"))) ++ print("{:<40} {:<10.2f}".format( ++ f"Median {metric_name} (ms):", ++ getattr(metrics, f"median_{metric_attribute_name}_ms"))) ++ result[f"mean_{metric_attribute_name}_ms"] = getattr( ++ metrics, f"mean_{metric_attribute_name}_ms") ++ result[f"median_{metric_attribute_name}_ms"] = getattr( ++ metrics, f"median_{metric_attribute_name}_ms") ++ result[f"std_{metric_attribute_name}_ms"] = getattr( ++ metrics, f"std_{metric_attribute_name}_ms") ++ for p, value in getattr(metrics, ++ f"percentiles_{metric_attribute_name}_ms"): ++ p_word = str(int(p)) if int(p) == p else str(p) ++ print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", ++ value)) ++ result[f"p{p_word}_{metric_attribute_name}_ms"] = value ++ ++ process_one_metric("ttft", "TTFT", "Time to First Token") ++ process_one_metric("tpot", "TPOT", ++ "Time per Output Token (excl. 1st token)") ++ process_one_metric("itl", "ITL", "Inter-token Latency") ++ process_one_metric("e2el", "E2EL", "End-to-end Latency") ++ ++ print("=" * 50) ++ ++ return result, ret ++ ++ ++def evaluate(ret, args): ++ ++ def _eval_correctness_json(expected, actual): ++ # extract json string from string using regex ++ import re ++ actual = actual.replace('\n', '').replace(' ', '').strip() ++ try: ++ actual = re.search(r'\{.*\}', actual).group() ++ actual = json.loads(actual) ++ except Exception: ++ return False ++ ++ return True ++ ++ def _eval_correctness_choice(expected, actual): ++ return actual in args.choice ++ ++ def _eval_correctness_regex(expected, actual): ++ import re ++ return re.match(args.regex, actual) is not None ++ ++ def _eval_correctness(expected, actual): ++ if args.structure_type == 'guided_json': ++ return _eval_correctness_json(expected, actual) ++ elif args.structure_type == 'guided_regex': ++ return _eval_correctness_regex(expected, actual) ++ elif args.structure_type == 'guided_choice': ++ return _eval_correctness_choice(expected, actual) ++ else: ++ return None ++ ++ scores = [] ++ for res in ret: ++ score = _eval_correctness(res['expected'], res['generated']) ++ res['correctness'] = score ++ scores.append(score) ++ ++ not_none_scores = [score for score in scores if score is not None] ++ ++ return (sum(not_none_scores) / len(not_none_scores) * ++ 100) if len(not_none_scores) > 0 else None ++ ++ ++def main(args: argparse.Namespace): ++ print(args) ++ random.seed(args.seed) ++ np.random.seed(args.seed) ++ ++ backend = args.backend ++ model_id = args.model ++ tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model ++ ++ if args.base_url is not None: ++ api_url = f"{args.base_url}{args.endpoint}" ++ base_url = f"{args.base_url}" ++ else: ++ api_url = f"http://{args.host}:{args.port}{args.endpoint}" ++ base_url = f"http://{args.host}:{args.port}" ++ ++ tokenizer = get_tokenizer(tokenizer_id, ++ trust_remote_code=args.trust_remote_code) ++ ++ if 
args.dataset == 'grammar': ++ args.structure_type = 'guided_grammar' ++ elif args.dataset == 'regex': ++ args.structure_type = 'guided_regex' ++ elif args.dataset == 'choice': ++ args.structure_type = 'guided_choice' ++ else: ++ args.structure_type = 'guided_json' ++ ++ if args.no_guided_decoding: ++ args.guided_decoding_ratio = 0 ++ if args.save_results: ++ result_file_name = f'{args.guided_decoding_ratio}guided' ++ result_file_name += f"_{backend}" ++ result_file_name += f"_{args.request_rate}qps" ++ result_file_name += f"_{args.model.split('/')[-1]}" ++ result_file_name += f"_{args.dataset}" ++ result_file_name += f"_{args.num_prompts}" ++ result_file_name += f"_out{args.output_len}" ++ result_file_name += ".txt" ++ else: ++ result_file_name = None ++ ++ input_requests = sample_requests(tokenizer, args) ++ ++ benchmark_result, ret = asyncio.run( ++ benchmark( ++ backend=backend, ++ api_url=api_url, ++ base_url=base_url, ++ model_id=model_id, ++ tokenizer=tokenizer, ++ input_requests=input_requests, ++ request_rate=args.request_rate, ++ burstiness=args.burstiness, ++ disable_tqdm=args.disable_tqdm, ++ profile=args.profile, ++ selected_percentile_metrics=args.percentile_metrics.split(","), ++ selected_percentiles=[ ++ float(p) for p in args.metric_percentiles.split(",") ++ ], ++ ignore_eos=args.ignore_eos, ++ max_concurrency=args.max_concurrency, ++ guided_decoding_ratio=args.guided_decoding_ratio, ++ guided_decoding_backend=args.guided_decoding_backend, ++ )) ++ ++ # Save config and results to json ++ score = evaluate(ret, args) ++ print("correct_rate(%)", score, '\n') ++ if args.save_results: ++ results = { ++ "backend": ++ backend, ++ "model_id": ++ model_id, ++ "tokenizer_id": ++ tokenizer_id, ++ "num_prompts": ++ args.num_prompts, ++ "request_rate": ++ args.request_rate if args.request_rate < float("inf") else "inf", ++ "burstiness": ++ args.burstiness, ++ "max_concurrency": ++ args.max_concurrency, ++ "correct_rate(%)": ++ score ++ } ++ results = {"outputs": ret, **results, **benchmark_result} ++ ++ # Save to file ++ if args.result_filename: ++ result_file_name = args.result_filename ++ if args.result_dir: ++ result_file_name = os.path.join(args.result_dir, result_file_name) ++ with open(result_file_name, "w", encoding='utf-8') as outfile: ++ json.dump(results, outfile, indent=4) ++ ++ ++if __name__ == "__main__": ++ parser = FlexibleArgumentParser( ++ description="Benchmark the online serving throughput.") ++ parser.add_argument( ++ "--backend", ++ type=str, ++ default="vllm", ++ choices=list(ASYNC_REQUEST_FUNCS.keys()), ++ ) ++ parser.add_argument( ++ "--base-url", ++ type=str, ++ default=None, ++ help="Server or API base url if not using http host and port.", ++ ) ++ parser.add_argument("--host", type=str, default="localhost") ++ parser.add_argument("--port", type=int, default=8000) ++ parser.add_argument( ++ "--endpoint", ++ type=str, ++ default="/v1/completions", ++ help="API endpoint.", ++ ) ++ parser.add_argument( ++ "--dataset", ++ default='json', ++ choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) ++ parser.add_argument("--json_schema_path", ++ type=str, ++ default=None, ++ help="Path to json schema.") ++ parser.add_argument( ++ "--max-concurrency", ++ type=int, ++ default=None, ++ help="Maximum number of concurrent requests. This can be used " ++ "to help simulate an environment where a higher level component " ++ "is enforcing a maximum number of concurrent requests. 
While the " ++ "--request-rate argument controls the rate at which requests are " ++ "initiated, this argument will control how many are actually allowed " ++ "to execute at a time. This means that when used in combination, the " ++ "actual request rate may be lower than specified with --request-rate, " ++ "if the server is not processing requests fast enough to keep up.") ++ parser.add_argument( ++ "--model", ++ type=str, ++ required=True, ++ help="Name of the model.", ++ ) ++ parser.add_argument( ++ "--tokenizer", ++ type=str, ++ help= ++ "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ++ ) ++ parser.add_argument( ++ "--num-prompts", ++ type=int, ++ default=1000, ++ help="Number of prompts to process.", ++ ) ++ parser.add_argument( ++ "--output-len", ++ type=int, ++ default=128, ++ help="Number of output tokens.", ++ ) ++ parser.add_argument( ++ "--request-rate", ++ type=float, ++ default=float("inf"), ++ help="Number of requests per second. If this is inf, " ++ "then all the requests are sent at time 0. " ++ "Otherwise, we use Poisson process or gamma distribution " ++ "to synthesize the request arrival times.", ++ ) ++ parser.add_argument( ++ "--burstiness", ++ type=float, ++ default=1.0, ++ help="Burstiness factor of the request generation. " ++ "Only take effect when request_rate is not inf. " ++ "Default value is 1, which follows Poisson process. " ++ "Otherwise, the request intervals follow a gamma distribution. " ++ "A lower burstiness value (0 < burstiness < 1) results in more " ++ "bursty requests. A higher burstiness value (burstiness > 1) " ++ "results in a more uniform arrival of requests.", ++ ) ++ parser.add_argument("--seed", type=int, default=0) ++ parser.add_argument( ++ "--trust-remote-code", ++ action="store_true", ++ help="Trust remote code from huggingface", ++ ) ++ parser.add_argument( ++ "--disable-tqdm", ++ action="store_true", ++ help="Specify to disable tqdm progress bar.", ++ ) ++ parser.add_argument( ++ "--save-results", ++ action="store_true", ++ help="Specify to save benchmark results to a json file", ++ ) ++ parser.add_argument( ++ "--profile", ++ action="store_true", ++ help="Use Torch Profiler. The endpoint must be launched with " ++ "VLLM_TORCH_PROFILER_DIR to enable profiler.", ++ ) ++ parser.add_argument( ++ "--result-dir", ++ type=str, ++ default=None, ++ help="Specify directory to save benchmark json results." ++ "If not specified, results are saved in the current directory.", ++ ) ++ parser.add_argument( ++ "--result-filename", ++ type=str, ++ default=None, ++ help="Specify the filename to save benchmark json results." ++ "If not specified, results will be saved in " ++ "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" ++ " format.", ++ ) ++ parser.add_argument( ++ "--ignore-eos", ++ action="store_true", ++ help="Set ignore_eos flag when sending the benchmark request." ++ "Warning: ignore_eos is not supported in deepspeed_mii and tgi.") ++ parser.add_argument( ++ "--percentile-metrics", ++ type=str, ++ default="ttft,tpot,itl", ++ help="Comma-seperated list of selected metrics to report percentils. " ++ "This argument specifies the metrics to report percentiles. " ++ "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " ++ "Default value is \"ttft,tpot,itl\".") ++ parser.add_argument( ++ "--metric-percentiles", ++ type=str, ++ default="99", ++ help="Comma-seperated list of percentiles for selected metrics. " ++ "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". 
" ++ "Default value is \"99\". " ++ "Use \"--percentile-metrics\" to select metrics.", ++ ) ++ parser.add_argument("--no-guided-decoding", ++ action='store_true', ++ default=False, ++ help="Whether to disable JSON decoding or not.") ++ parser.add_argument("--guided-decoding-ratio", ++ type=float, ++ default=1.0, ++ help="Ratio of Guided Decoding requests") ++ parser.add_argument("--guided-decoding-backend", ++ type=str, ++ choices=["outlines", "lm-format-enforcer", "xgrammar"], ++ default="xgrammar", ++ help="Backend to use for guided decoding") ++ ++ args = parser.parse_args() ++ main(args) +diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py +index 695d06e..c1b10b3 100644 +--- a/benchmarks/benchmark_throughput.py ++++ b/benchmarks/benchmark_throughput.py +@@ -1,24 +1,99 @@ + """Benchmark offline inference throughput.""" + import argparse ++import dataclasses + import json + import random + import time +-from typing import List, Optional, Tuple ++from functools import cache ++from typing import Dict, List, Optional, Tuple + + import torch ++import uvloop ++from PIL import Image + from tqdm import tqdm + from transformers import (AutoModelForCausalLM, AutoTokenizer, + PreTrainedTokenizerBase) + +-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS ++from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs ++from vllm.entrypoints.openai.api_server import ( ++ build_async_engine_client_from_engine_args) ++from vllm.inputs import TextPrompt ++from vllm.lora.request import LoRARequest ++from vllm.lora.utils import get_adapter_absolute_path ++from vllm.multimodal import MultiModalDataDict ++from vllm.sampling_params import BeamSearchParams ++from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer ++from vllm.utils import FlexibleArgumentParser, merge_async_iterators + + +-def sample_requests( +- dataset_path: str, +- num_requests: int, +- tokenizer: PreTrainedTokenizerBase, +- fixed_output_len: Optional[int], +-) -> List[Tuple[str, int, int]]: ++@dataclasses.dataclass ++class SampleRequest: ++ """A class representing a single inference request for benchmarking. ++ ++ Attributes: ++ prompt: The input text prompt for the model. ++ prompt_len: The length of the prompt in tokens. ++ expected_output_len: The expected length of the output in tokens. ++ multi_modal_data: Optional dictionary containing multi-modal data (e.g. ++ images). ++ lora_request: Optional LoRARequest specifying the LoRA to use. ++ """ ++ prompt: str ++ prompt_len: int ++ expected_output_len: int ++ multi_modal_data: Optional[MultiModalDataDict] = None ++ lora_request: Optional[LoRARequest] = None ++ ++ ++def _get_prompt_for_image_model(question: str, *, model: str) -> str: ++ """Prepend and append special tokens around the question to form a prompt. 
++ ++ Args: ++ question: The input question text to wrap with special tokens ++ model: The name of the model being used, to determine which special ++ tokens to add ++ ++ Returns: ++ The formatted prompt string with appropriate special tokens for the ++ model ++ ++ Raises: ++ ValueError: If an unsupported model name is provided ++ """ ++ model = model.lower() ++ if "pixtral" in model: ++ return f"[INST]{question}\n[IMG][/INST]" ++ raise ValueError(f"Unsupported model {model}") ++ ++ ++@cache ++def lora_path_on_disk(lora_path: str) -> str: ++ return get_adapter_absolute_path(lora_path) ++ ++ ++lora_tokenizer_cache: Dict[int, AnyTokenizer] = {} ++ ++ ++def get_random_lora_request( ++ args: argparse.Namespace ++) -> Tuple[LoRARequest, Optional[AnyTokenizer]]: ++ global lora_tokenizer_cache ++ lora_id = random.randint(1, args.max_loras) ++ lora_request = LoRARequest(lora_name=str(lora_id), ++ lora_int_id=lora_id, ++ lora_path=lora_path_on_disk(args.lora_path)) ++ if lora_id not in lora_tokenizer_cache: ++ lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) ++ return lora_request, lora_tokenizer_cache[lora_id] ++ ++ ++def sample_requests(tokenizer: PreTrainedTokenizerBase, ++ args: argparse.Namespace) -> List[SampleRequest]: ++ ++ dataset_path: str = args.dataset ++ num_requests: int = args.num_prompts ++ fixed_output_len: Optional[int] = args.output_len ++ model: str = args.model + if fixed_output_len is not None and fixed_output_len < 4: + raise ValueError("output_len too small") + +@@ -27,24 +102,46 @@ def sample_requests( + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] +- # Only keep the first two turns of each conversation. +- dataset = [(data["conversations"][0]["value"], +- data["conversations"][1]["value"]) for data in dataset] +- + # Shuffle the dataset. + random.shuffle(dataset) + + # Filter out sequences that are too long or too short +- filtered_dataset: List[Tuple[str, int, int]] = [] +- for i in range(len(dataset)): ++ filtered_dataset: List[SampleRequest] = [] ++ for data in tqdm(dataset, ++ total=len(filtered_dataset), ++ desc="sampling requests"): + if len(filtered_dataset) == num_requests: + break + ++ # Only keep the first two turns of each conversation. ++ prompt = data["conversations"][0]["value"] ++ completion = data["conversations"][1]["value"] ++ ++ multi_modal_data: Optional[MultiModalDataDict] = None ++ if "image" in data: ++ multi_modal_data = multi_modal_data or {} ++ image_path = data["image"] ++ # TODO(vllm-project/vllm/issues/9778): Support multiple images. ++ assert isinstance(image_path, ++ str), "Only support single image input" ++ try: ++ multi_modal_data["image"] = Image.open(image_path).convert( ++ "RGB") ++ except FileNotFoundError: ++ # Ignore datapoint where asset is missing ++ continue ++ prompt = _get_prompt_for_image_model(question=prompt, model=model) ++ ++ request_tokenizer = tokenizer ++ lora_request: Optional[LoRARequest] = None ++ if args.enable_lora: ++ lora_request, lora_tokenizer = get_random_lora_request(args) ++ if lora_tokenizer: ++ request_tokenizer = lora_tokenizer ++ + # Tokenize the prompts and completions. 
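For reference, the prompt wrapper used just above behaves as follows for a pixtral-style checkpoint (the model name here is purely illustrative):

question = "What is shown in this image?"
prompt = _get_prompt_for_image_model(question=question,
                                     model="mistralai/Pixtral-12B-2409")
# -> "[INST]What is shown in this image?\n[IMG][/INST]"
# Any model name without a known template raises ValueError.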
+- prompt = dataset[i][0] +- prompt_token_ids = tokenizer(prompt).input_ids +- completion = dataset[i][1] +- completion_token_ids = tokenizer(completion).input_ids ++ prompt_token_ids = request_tokenizer(prompt).input_ids ++ completion_token_ids = request_tokenizer(completion).input_ids + prompt_len = len(prompt_token_ids) + output_len = len(completion_token_ids + ) if fixed_output_len is None else fixed_output_len +@@ -54,85 +151,124 @@ def sample_requests( + if prompt_len > 1024 or prompt_len + output_len > 2048: + # Prune too long sequences. + continue +- filtered_dataset.append((prompt, prompt_len, output_len)) ++ filtered_dataset.append( ++ SampleRequest(prompt=prompt, ++ prompt_len=prompt_len, ++ expected_output_len=output_len, ++ multi_modal_data=multi_modal_data, ++ lora_request=lora_request)) + + return filtered_dataset + + + def run_vllm( +- requests: List[Tuple[str, int, int]], +- model: str, +- tokenizer: str, +- quantization: Optional[str], +- tensor_parallel_size: int, +- seed: int, ++ requests: List[SampleRequest], + n: int, +- use_beam_search: bool, +- trust_remote_code: bool, +- dtype: str, +- max_model_len: Optional[int], +- enforce_eager: bool, +- kv_cache_dtype: str, +- quantization_param_path: Optional[str], +- device: str, +- enable_prefix_caching: bool, +- enable_chunked_prefill: bool, +- max_num_batched_tokens: int, +- gpu_memory_utilization: float = 0.9, +- download_dir: Optional[str] = None, ++ engine_args: EngineArgs, + ) -> float: + from vllm import LLM, SamplingParams +- llm = LLM( +- model=model, +- tokenizer=tokenizer, +- quantization=quantization, +- tensor_parallel_size=tensor_parallel_size, +- seed=seed, +- trust_remote_code=trust_remote_code, +- dtype=dtype, +- max_model_len=max_model_len, +- gpu_memory_utilization=gpu_memory_utilization, +- enforce_eager=enforce_eager, +- kv_cache_dtype=kv_cache_dtype, +- quantization_param_path=quantization_param_path, +- device=device, +- enable_prefix_caching=enable_prefix_caching, +- download_dir=download_dir, +- enable_chunked_prefill=enable_chunked_prefill, +- max_num_batched_tokens=max_num_batched_tokens, +- ) ++ llm = LLM(**dataclasses.asdict(engine_args)) + + # Add the requests to the engine. +- prompts = [] +- sampling_params = [] +- for prompt, _, output_len in requests: +- prompts.append(prompt) ++ prompts: List[TextPrompt] = [] ++ sampling_params: List[SamplingParams] = [] ++ for request in requests: ++ prompts.append( ++ TextPrompt(prompt=request.prompt, ++ multi_modal_data=request.multi_modal_data)) + sampling_params.append( + SamplingParams( + n=n, +- temperature=0.0 if use_beam_search else 1.0, ++ temperature=1.0, + top_p=1.0, +- use_beam_search=use_beam_search, + ignore_eos=True, +- max_tokens=output_len, ++ max_tokens=request.expected_output_len, + )) ++ lora_requests: Optional[List[LoRARequest]] = None ++ if engine_args.enable_lora: ++ lora_requests = [request.lora_request for request in requests] + +- start = time.perf_counter() +- llm.generate(prompts, sampling_params, use_tqdm=True) +- end = time.perf_counter() ++ use_beam_search = False ++ ++ if not use_beam_search: ++ start = time.perf_counter() ++ llm.generate(prompts, ++ sampling_params, ++ lora_request=lora_requests, ++ use_tqdm=True) ++ end = time.perf_counter() ++ else: ++ assert lora_requests is None, "BeamSearch API does not support LoRA" ++ prompts = [request.prompt for request in requests] ++ # output_len should be the same for all requests. 
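The refactor above collapses the long positional argument list of run_vllm into a single engine-args dataclass that is unpacked into the constructor. A reduced sketch of that pattern, with a toy dataclass standing in for the real engine arguments:

import dataclasses

@dataclasses.dataclass
class ToyEngineArgs:
    model: str = "facebook/opt-125m"   # illustrative defaults
    tensor_parallel_size: int = 1
    dtype: str = "auto"

def build_engine(**kwargs):
    # stands in for the real LLM(...) constructor
    return kwargs

engine = build_engine(**dataclasses.asdict(ToyEngineArgs(tensor_parallel_size=2)))
assert engine["tensor_parallel_size"] == 2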
++ output_len = requests[0][2] ++ for request in requests: ++ assert request.expected_output_len == output_len ++ start = time.perf_counter() ++ llm.beam_search( ++ prompts, ++ BeamSearchParams( ++ beam_width=n, ++ max_tokens=output_len, ++ ignore_eos=True, ++ )) ++ end = time.perf_counter() + return end - start + + ++async def run_vllm_async( ++ requests: List[SampleRequest], ++ n: int, ++ engine_args: AsyncEngineArgs, ++ disable_frontend_multiprocessing: bool = False, ++) -> float: ++ from vllm import SamplingParams ++ ++ async with build_async_engine_client_from_engine_args( ++ engine_args, disable_frontend_multiprocessing) as llm: ++ ++ # Add the requests to the engine. ++ prompts: List[TextPrompt] = [] ++ sampling_params: List[SamplingParams] = [] ++ lora_requests: List[Optional[LoRARequest]] = [] ++ for request in requests: ++ prompts.append( ++ TextPrompt(prompt=request.prompt, ++ multi_modal_data=request.multi_modal_data)) ++ sampling_params.append( ++ SamplingParams( ++ n=n, ++ temperature=1.0, ++ top_p=1.0, ++ ignore_eos=True, ++ max_tokens=request.expected_output_len, ++ )) ++ lora_requests.append(request.lora_request) ++ ++ generators = [] ++ start = time.perf_counter() ++ for i, (prompt, sp, ++ lr) in enumerate(zip(prompts, sampling_params, lora_requests)): ++ generator = llm.generate(prompt, ++ sp, ++ lora_request=lr, ++ request_id=f"test{i}") ++ generators.append(generator) ++ all_gens = merge_async_iterators(*generators) ++ async for i, res in all_gens: ++ pass ++ end = time.perf_counter() ++ return end - start ++ ++ + def run_hf( +- requests: List[Tuple[str, int, int]], ++ requests: List[SampleRequest], + model: str, + tokenizer: PreTrainedTokenizerBase, + n: int, +- use_beam_search: bool, + max_batch_size: int, + trust_remote_code: bool, + ) -> float: +- assert not use_beam_search + llm = AutoModelForCausalLM.from_pretrained( + model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) + if llm.config.model_type == "llama": +@@ -164,7 +300,7 @@ def run_hf( + padding=True).input_ids + llm_outputs = llm.generate( + input_ids=input_ids.cuda(), +- do_sample=not use_beam_search, ++ do_sample=True, + num_return_sequences=n, + temperature=1.0, + top_p=1.0, +@@ -184,14 +320,14 @@ def run_hf( + + + def run_mii( +- requests: List[Tuple[str, int, int]], ++ requests: List[SampleRequest], + model: str, + tensor_parallel_size: int, + output_len: int, + ) -> float: + from mii import client, serve + llm = serve(model, tensor_parallel=tensor_parallel_size) +- prompts = [prompt for prompt, _, _ in requests] ++ prompts = [request.prompt for request in requests] + + start = time.perf_counter() + llm.generate(prompts, max_new_tokens=output_len) +@@ -209,42 +345,99 @@ def main(args: argparse.Namespace): + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + if args.dataset is None: +- # Synthesize a prompt with the given input length. +- prompt = "hi" * (args.input_len - 1) +- requests = [(prompt, args.input_len, args.output_len) +- for _ in range(args.num_prompts)] ++ vocab_size = tokenizer.vocab_size ++ requests = [] ++ for _ in range(args.num_prompts): ++ ++ request_tokenizer = tokenizer ++ lora_request: Optional[LoRARequest] = None ++ if args.enable_lora: ++ lora_request, lora_tokenizer = get_random_lora_request(args) ++ if lora_tokenizer: ++ request_tokenizer = lora_tokenizer ++ ++ # Synthesize a prompt with the given input length. 
++ candidate_ids = [ ++ random.randint(0, vocab_size - 1) ++ for _ in range(args.input_len) ++ ] ++ # As tokenizer may add additional tokens like BOS, we need to try ++ # different lengths to get the desired input length. ++ for _ in range(5): # Max attempts to correct ++ candidate_prompt = request_tokenizer.decode(candidate_ids) ++ tokenized_len = len(request_tokenizer.encode(candidate_prompt)) ++ ++ if tokenized_len == args.input_len: ++ break ++ ++ # Adjust length based on difference ++ diff = args.input_len - tokenized_len ++ if diff > 0: ++ candidate_ids.extend([ ++ random.randint(100, vocab_size - 100) ++ for _ in range(diff) ++ ]) ++ else: ++ candidate_ids = candidate_ids[:diff] ++ requests.append( ++ SampleRequest(prompt=candidate_prompt, ++ prompt_len=args.input_len, ++ expected_output_len=args.output_len, ++ lora_request=lora_request)) + else: +- requests = sample_requests(args.dataset, args.num_prompts, tokenizer, +- args.output_len) ++ requests = sample_requests(tokenizer, args) + ++ is_multi_modal = any(request.multi_modal_data is not None ++ for request in requests) + if args.backend == "vllm": +- elapsed_time = run_vllm( +- requests, args.model, args.tokenizer, args.quantization, +- args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, +- args.trust_remote_code, args.dtype, args.max_model_len, +- args.enforce_eager, args.kv_cache_dtype, +- args.quantization_param_path, args.device, +- args.enable_prefix_caching, args.enable_chunked_prefill, +- args.max_num_batched_tokens, args.gpu_memory_utilization, +- args.download_dir) ++ if args.async_engine: ++ elapsed_time = uvloop.run( ++ run_vllm_async( ++ requests, ++ args.n, ++ AsyncEngineArgs.from_cli_args(args), ++ args.disable_frontend_multiprocessing, ++ )) ++ else: ++ elapsed_time = run_vllm(requests, args.n, ++ EngineArgs.from_cli_args(args)) + elif args.backend == "hf": + assert args.tensor_parallel_size == 1 + elapsed_time = run_hf(requests, args.model, tokenizer, args.n, +- args.use_beam_search, args.hf_max_batch_size, +- args.trust_remote_code) ++ args.hf_max_batch_size, args.trust_remote_code) + elif args.backend == "mii": + elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, + args.output_len) + else: + raise ValueError(f"Unknown backend: {args.backend}") +- total_num_tokens = sum(prompt_len + output_len +- for _, prompt_len, output_len in requests) ++ total_num_tokens = sum(request.prompt_len + request.expected_output_len ++ for request in requests) ++ total_output_tokens = sum(request.expected_output_len ++ for request in requests) ++ if is_multi_modal: ++ print("\033[91mWARNING\033[0m: Multi-modal request detected. The " ++ "following metrics are not accurate because image tokens are not" ++ " counted. See vllm-project/vllm/issues/9778 for details.") ++ # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. 
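A condensed restatement, as a standalone sketch rather than patch content, of the length-correction idea used above: decode random token ids, re-encode, and pad or trim until the re-tokenized length matches the target. The `"gpt2"` tokenizer is only an example stand-in for the benchmark's tokenizer.

import random
from transformers import AutoTokenizer

def synthesize_prompt(tokenizer, target_len: int, max_attempts: int = 5) -> str:
    vocab_size = tokenizer.vocab_size
    ids = [random.randint(0, vocab_size - 1) for _ in range(target_len)]
    prompt = tokenizer.decode(ids)
    for _ in range(max_attempts):
        actual = len(tokenizer.encode(prompt))
        if actual == target_len:
            break
        # Tokenizers may merge ids or add special tokens; adjust and retry.
        diff = target_len - actual
        if diff > 0:
            ids.extend(random.randint(100, vocab_size - 100) for _ in range(diff))
        else:
            ids = ids[:diff]
        prompt = tokenizer.decode(ids)
    return prompt

if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained("gpt2")
    print(len(tok.encode(synthesize_prompt(tok, 32))))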
+ print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " +- f"{total_num_tokens / elapsed_time:.2f} tokens/s") ++ f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " ++ f"{total_output_tokens / elapsed_time:.2f} output tokens/s") ++ ++ # Output JSON results if specified ++ if args.output_json: ++ results = { ++ "elapsed_time": elapsed_time, ++ "num_requests": len(requests), ++ "total_num_tokens": total_num_tokens, ++ "requests_per_second": len(requests) / elapsed_time, ++ "tokens_per_second": total_num_tokens / elapsed_time, ++ } ++ with open(args.output_json, "w") as f: ++ json.dump(results, f, indent=4) + + + if __name__ == "__main__": +- parser = argparse.ArgumentParser(description="Benchmark the throughput.") ++ parser = FlexibleArgumentParser(description="Benchmark the throughput.") + parser.add_argument("--backend", + type=str, + choices=["vllm", "hf", "mii"], +@@ -252,7 +445,9 @@ if __name__ == "__main__": + parser.add_argument("--dataset", + type=str, + default=None, +- help="Path to the dataset.") ++ help="Path to the dataset. The dataset is expected to " ++ "be a json in form of List[Dict[..., conversations: " ++ "List[Dict[..., value: ]]]]") + parser.add_argument("--input-len", + type=int, + default=None, +@@ -262,97 +457,40 @@ if __name__ == "__main__": + default=None, + help="Output length for each request. Overrides the " + "output length from the dataset.") +- parser.add_argument("--model", type=str, default="facebook/opt-125m") +- parser.add_argument("--tokenizer", type=str, default=None) +- parser.add_argument('--quantization', +- '-q', +- choices=[*QUANTIZATION_METHODS, None], +- default=None) +- parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--n", + type=int, + default=1, + help="Number of generated sequences per prompt.") +- parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument("--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.") +- parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--hf-max-batch-size", + type=int, + default=None, + help="Maximum batch size for HF backend.") +- parser.add_argument('--trust-remote-code', +- action='store_true', +- help='trust remote code from huggingface') +- parser.add_argument( +- '--max-model-len', +- type=int, +- default=None, +- help='Maximum length of a sequence (including prompt and output). ' +- 'If None, will be derived from the model.') +- parser.add_argument( +- '--dtype', +- type=str, +- default='auto', +- choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], +- help='data type for model weights and activations. ' +- 'The "auto" option will use FP16 precision ' +- 'for FP32 and FP16 models, and BF16 precision ' +- 'for BF16 models.') +- parser.add_argument('--gpu-memory-utilization', +- type=float, +- default=0.9, +- help='the fraction of GPU memory to be used for ' +- 'the model executor, which can range from 0 to 1.' +- 'If unspecified, will use the default value of 0.9.') +- parser.add_argument("--enforce-eager", +- action="store_true", +- help="enforce eager execution") +- parser.add_argument( +- "--kv-cache-dtype", +- type=str, +- choices=["auto", "fp8"], +- default="auto", +- help= +- 'Data type for kv cache storage. If "auto", will use model data type. ' +- 'FP8_E5M2 (without scaling) is only supported on cuda version greater ' +- 'than 11.8. 
On ROCm (AMD GPU), FP8_E4M3 is instead supported for ' +- 'common inference criteria.') + parser.add_argument( +- '--quantization-param-path', ++ '--output-json', + type=str, + default=None, +- help='Path to the JSON file containing the KV cache scaling factors. ' +- 'This should generally be supplied, when KV cache dtype is FP8. ' +- 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' +- 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' +- 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' +- 'instead supported for common inference criteria.') ++ help='Path to save the throughput results in JSON format.') ++ parser.add_argument("--async-engine", ++ action='store_true', ++ default=False, ++ help="Use vLLM async engine rather than LLM class.") ++ parser.add_argument("--disable-frontend-multiprocessing", ++ action='store_true', ++ default=False, ++ help="Disable decoupled async engine frontend.") ++ # LoRA + parser.add_argument( +- "--device", ++ "--lora-path", + type=str, +- default="cuda", +- choices=["cuda", "cpu"], +- help='device type for vLLM execution, supporting CUDA and CPU.') +- parser.add_argument( +- "--enable-prefix-caching", +- action='store_true', +- help="enable automatic prefix caching for vLLM backend.") +- parser.add_argument("--enable-chunked-prefill", +- action='store_true', +- help="enable chunked prefill for vLLM backend.") +- parser.add_argument('--max-num-batched-tokens', +- type=int, +- default=None, +- help='maximum number of batched tokens per ' +- 'iteration') +- parser.add_argument('--download-dir', +- type=str, +- default=None, +- help='directory to download and load the weights, ' +- 'default to the default cache dir of huggingface') ++ default=None, ++ help="Path to the lora adapters to use. 
This can be an absolute path, " ++ "a relative path, or a Hugging Face model identifier.") ++ ++ parser = AsyncEngineArgs.add_cli_args(parser) + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model +@@ -361,6 +499,8 @@ if __name__ == "__main__": + assert args.output_len is not None + else: + assert args.input_len is None ++ if args.enable_lora: ++ assert args.lora_path is not None + + if args.backend == "vllm": + if args.hf_max_batch_size is not None: +@@ -370,13 +510,14 @@ if __name__ == "__main__": + raise ValueError("HF max batch size is required for HF backend.") + if args.quantization is not None: + raise ValueError("Quantization is only for vLLM backend.") ++ if args.enable_lora is not None: ++ raise ValueError("LoRA benchmarking is only supported for vLLM" ++ " backend") + elif args.backend == "mii": + if args.dtype != "auto": + raise ValueError("dtype must be auto for MII backend.") + if args.n != 1: + raise ValueError("n must be 1 for MII backend.") +- if args.use_beam_search: +- raise ValueError("Beam search is not supported for MII backend.") + if args.quantization is not None: + raise ValueError("Quantization is only for vLLM backend.") + if args.hf_max_batch_size is not None: +@@ -384,4 +525,7 @@ if __name__ == "__main__": + if args.tokenizer != args.model: + raise ValueError("Tokenizer must be the same as the model for MII " + "backend.") ++ if args.enable_lora is not None: ++ raise ValueError("LoRA benchmarking is only supported for vLLM" ++ " backend") + main(args) +diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +new file mode 100644 +index 0000000..3d1c5e3 +--- /dev/null ++++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +@@ -0,0 +1,384 @@ ++import argparse ++import copy ++import itertools ++import pickle as pkl ++import time ++from typing import Callable, Iterable, List, Tuple ++ ++import torch ++import torch.utils.benchmark as TBenchmark ++from torch.utils.benchmark import Measurement as TMeasurement ++from utils import make_rand_sparse_tensors ++from weight_shapes import WEIGHT_SHAPES ++ ++from vllm import _custom_ops as ops ++from vllm.utils import FlexibleArgumentParser ++ ++DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) ++DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] ++DEFAULT_TP_SIZES = [1] ++ ++ ++# bench ++def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, ++ **kwargs) -> TMeasurement: ++ min_run_time = 1 ++ ++ globals = { ++ "args": args, ++ "kwargs": kwargs, ++ "fn": fn, ++ } ++ return TBenchmark.Timer( ++ stmt="fn(*args, **kwargs)", ++ globals=globals, ++ label=label, ++ sub_label=sub_label, ++ description=description, ++ ).blocked_autorange(min_run_time=min_run_time) ++ ++ ++def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, ++ sub_label: str) -> Iterable[TMeasurement]: ++ assert dtype == torch.int8 ++ b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k) ++ scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) ++ scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) ++ bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) ++ ++ out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b, ++ torch.bfloat16) ++ out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) ++ ++ if not torch.allclose(out, out_ref): ++ print("Incorrect results") ++ print(out) ++ print(out_ref) ++ else: ++ print("Correct results") ++ ++ timers = [] ++ # 
pytorch impl - bfloat16 ++ timers.append( ++ bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", ++ torch.mm, a.to(dtype=torch.bfloat16), ++ b.to(dtype=torch.bfloat16))) ++ ++ # pytorch impl - float16 ++ timers.append( ++ bench_fn(label, sub_label, ++ "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm, ++ a.to(dtype=torch.float16), b.to(dtype=torch.float16))) ++ ++ # cutlass impl ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm", ++ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, ++ torch.bfloat16)) ++ ++ # cutlass with bias ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias", ++ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16, ++ bias)) ++ ++ # cutlass sparse impl ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm", ++ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, ++ scale_b, torch.bfloat16)) ++ ++ # cutlass sparse with bias ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias", ++ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, ++ scale_b, torch.bfloat16, bias)) ++ ++ return timers ++ ++ ++def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, ++ sub_label: str) -> Iterable[TMeasurement]: ++ assert dtype == torch.float8_e4m3fn ++ b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, ++ k) ++ scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) ++ scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) ++ bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) ++ ++ out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b, ++ torch.bfloat16) ++ out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) ++ ++ if not torch.allclose(out, out_ref): ++ print("Incorrect results") ++ print(out) ++ print(out_ref) ++ else: ++ print("Correct results") ++ ++ timers = [] ++ ++ # pytorch impl w. 
bf16 ++ timers.append( ++ bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", ++ torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), ++ b.to(dtype=torch.bfloat16, device="cuda"))) ++ ++ # pytorch impl: bf16 output, without fp8 fast accum ++ timers.append( ++ bench_fn(label, ++ sub_label, ++ "pytorch_fp8_fp8_bf16_scaled_mm", ++ torch._scaled_mm, ++ a, ++ b, ++ scale_a=scale_a, ++ scale_b=scale_b, ++ out_dtype=torch.bfloat16)) ++ ++ # pytorch impl: bf16 output, with fp8 fast accum ++ timers.append( ++ bench_fn(label, ++ sub_label, ++ "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", ++ torch._scaled_mm, ++ a, ++ b, ++ scale_a=scale_a, ++ scale_b=scale_b, ++ out_dtype=torch.bfloat16, ++ use_fast_accum=True)) ++ ++ # pytorch impl: fp16 output, without fp8 fast accum ++ timers.append( ++ bench_fn(label, ++ sub_label, ++ "pytorch_fp8_fp8_fp16_scaled_mm", ++ torch._scaled_mm, ++ a, ++ b, ++ scale_a=scale_a, ++ scale_b=scale_b, ++ out_dtype=torch.float16)) ++ ++ # pytorch impl: fp16 output, with fp8 fast accum ++ timers.append( ++ bench_fn(label, ++ sub_label, ++ "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", ++ torch._scaled_mm, ++ a, ++ b, ++ scale_a=scale_a, ++ scale_b=scale_b, ++ out_dtype=torch.float16, ++ use_fast_accum=True)) ++ ++ # cutlass impl: bf16 output ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm", ++ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, ++ torch.bfloat16)) ++ ++ # cutlass impl: bf16 output ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm", ++ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, ++ scale_b, torch.bfloat16)) ++ ++ # cutlass impl: fp16 output ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm", ++ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, ++ scale_b, torch.float16)) ++ ++ # cutlass impl: bf16 output, with bias ++ timers.append( ++ bench_fn(label, sub_label, ++ "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias", ++ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, ++ scale_b, torch.bfloat16, bias)) ++ ++ # cutlass impl: fp16 output, with bias ++ timers.append( ++ bench_fn(label, sub_label, ++ "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias", ++ ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, ++ scale_b, torch.float16, bias.to(dtype=torch.float16))) ++ ++ return timers ++ ++ ++def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str, ++ sub_label: str) -> Iterable[TMeasurement]: ++ if dtype == torch.int8: ++ return bench_int8(dtype, m, k, n, label, sub_label) ++ if dtype == torch.float8_e4m3fn: ++ return bench_fp8(dtype, m, k, n, label, sub_label) ++ raise ValueError("unsupported type") ++ ++ ++# runner ++def print_timers(timers: Iterable[TMeasurement]): ++ compare = TBenchmark.Compare(timers) ++ compare.print() ++ ++ ++def run(dtype: torch.dtype, ++ MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: ++ results = [] ++ for m, k, n in MKNs: ++ timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", ++ f"MKN=({m}x{k}x{n})") ++ print_timers(timers) ++ results.extend(timers) ++ ++ return results ++ ++ ++# output makers ++def make_output(data: Iterable[TMeasurement], ++ MKNs: Iterable[Tuple[int, int, int]], ++ base_description: str, ++ timestamp=None): ++ print(f"== All Results {base_description} ====") ++ print_timers(data) ++ ++ # pickle all the results ++ timestamp = int(time.time()) if timestamp is None else timestamp ++ with open(f"{base_description}-{timestamp}.pkl", "wb") as f: ++ 
pkl.dump(data, f) ++ ++ ++# argparse runners ++ ++ ++def run_square_bench(args): ++ dim_sizes = list( ++ range(args.dim_start, args.dim_end + 1, args.dim_increment)) ++ MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) ++ data = run(args.dtype, MKNs) ++ ++ make_output(data, MKNs, f"square_bench-{args.dtype}") ++ ++ ++def run_range_bench(args): ++ dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment)) ++ n = len(dim_sizes) ++ Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes ++ Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes ++ Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes ++ MKNs = list(zip(Ms, Ks, Ns)) ++ data = run(args.dtype, MKNs) ++ ++ make_output(data, MKNs, f"range_bench-{args.dtype}") ++ ++ ++def run_model_bench(args): ++ print("Benchmarking models:") ++ for i, model in enumerate(args.models): ++ print(f"[{i}] {model}") ++ ++ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: ++ KNs = [] ++ for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): ++ KN[tp_split_dim] = KN[tp_split_dim] // tp_size ++ KNs.append(KN) ++ return KNs ++ ++ model_bench_data = [] ++ models_tps = list(itertools.product(args.models, args.tp_sizes)) ++ for model, tp_size in models_tps: ++ Ms = args.batch_sizes ++ KNs = model_shapes(model, tp_size) ++ MKNs = [] ++ for m in Ms: ++ for k, n in KNs: ++ MKNs.append((m, k, n)) ++ ++ data = run(args.dtype, MKNs) ++ model_bench_data.append(data) ++ ++ # Print all results ++ for data, model_tp in zip(model_bench_data, models_tps): ++ model, tp_size = model_tp ++ print(f"== Results {args.dtype} {model}-TP{tp_size} ====") ++ print_timers(data) ++ ++ timestamp = int(time.time()) ++ ++ all_data = [] ++ for d in model_bench_data: ++ all_data.extend(d) ++ # pickle all data ++ with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: ++ pkl.dump(all_data, f) ++ ++ ++if __name__ == '__main__': ++ ++ def to_torch_dtype(dt): ++ if dt == "int8": ++ return torch.int8 ++ if dt == "fp8": ++ return torch.float8_e4m3fn ++ raise ValueError("unsupported dtype") ++ ++ parser = FlexibleArgumentParser( ++ description=""" ++Benchmark Cutlass GEMM. ++ ++ To run square GEMMs: ++ python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 ++ ++ To run constant N and K and sweep M: ++ python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 ++ ++ To run dimensions from a model: ++ python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 ++ ++ Output: ++ - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. 
++ """, # noqa: E501 ++ formatter_class=argparse.RawTextHelpFormatter) ++ ++ parser.add_argument("--dtype", ++ type=to_torch_dtype, ++ required=True, ++ help="Available options are ['int8', 'fp8']") ++ subparsers = parser.add_subparsers(dest="cmd") ++ ++ square_parser = subparsers.add_parser("square_bench") ++ square_parser.add_argument("--dim-start", type=int, required=True) ++ square_parser.add_argument("--dim-end", type=int, required=True) ++ square_parser.add_argument("--dim-increment", type=int, required=True) ++ square_parser.set_defaults(func=run_square_bench) ++ ++ range_parser = subparsers.add_parser("range_bench") ++ range_parser.add_argument("--dim-start", type=int, required=True) ++ range_parser.add_argument("--dim-end", type=int, required=True) ++ range_parser.add_argument("--dim-increment", type=int, required=True) ++ range_parser.add_argument("--m-constant", type=int, default=None) ++ range_parser.add_argument("--n-constant", type=int, default=None) ++ range_parser.add_argument("--k-constant", type=int, default=None) ++ range_parser.set_defaults(func=run_range_bench) ++ ++ model_parser = subparsers.add_parser("model_bench") ++ model_parser.add_argument("--models", ++ nargs="+", ++ type=str, ++ default=DEFAULT_MODELS, ++ choices=WEIGHT_SHAPES.keys()) ++ model_parser.add_argument("--tp-sizes", ++ nargs="+", ++ type=int, ++ default=DEFAULT_TP_SIZES) ++ model_parser.add_argument("--batch-sizes", ++ nargs="+", ++ type=int, ++ default=DEFAULT_BATCH_SIZES) ++ model_parser.set_defaults(func=run_model_bench) ++ ++ args = parser.parse_args() ++ args.func(args) +diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py +new file mode 100644 +index 0000000..ef06fcd +--- /dev/null ++++ b/benchmarks/cutlass_benchmarks/utils.py +@@ -0,0 +1,96 @@ ++# Cutlass bench utils ++from typing import Iterable, Tuple ++ ++import torch ++ ++import vllm._custom_ops as ops ++ ++ ++def to_fp8(tensor: torch.Tensor) -> torch.Tensor: ++ finfo = torch.finfo(torch.float8_e4m3fn) ++ return torch.round(tensor.clamp( ++ min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) ++ ++ ++def to_int8(tensor: torch.Tensor) -> torch.Tensor: ++ return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) ++ ++ ++def to_bf16(tensor: torch.Tensor) -> torch.Tensor: ++ return tensor.to(dtype=torch.bfloat16) ++ ++ ++def to_fp16(tensor: torch.Tensor) -> torch.Tensor: ++ return tensor.to(dtype=torch.float16) ++ ++ ++def make_rand_tensors(dtype: torch.dtype, m: int, n: int, ++ k: int) -> Tuple[torch.Tensor, torch.Tensor]: ++ a = torch.randn((m, k), device='cuda') * 5 ++ b = torch.randn((n, k), device='cuda').t() * 5 ++ ++ if dtype == torch.int8: ++ return to_int8(a), to_int8(b) ++ if dtype == torch.float8_e4m3fn: ++ return to_fp8(a), to_fp8(b) ++ ++ raise ValueError("unsupported dtype") ++ ++ ++def prune_to_2_4(tensor): ++ # Reshape tensor to [N, 4] where N is number of groups of 4 ++ original_shape = tensor.shape ++ reshaped = tensor.reshape(-1, 4) ++ ++ # Get indices of top 2 absolute values in each group of 4 ++ _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) ++ ++ # Create binary mask ++ mask = torch.zeros_like(reshaped) ++ mask.scatter_(dim=1, ++ index=indices, ++ src=torch.ones_like(indices, dtype=mask.dtype)) ++ ++ # Apply mask and reshape back ++ pruned = reshaped * mask ++ ++ # Turn all -0.0 to 0.0 ++ pruned[pruned == -0.0] = 0.0 ++ ++ return pruned.reshape(original_shape) ++ ++ ++def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, ++ k: int) -> 
Tuple[torch.Tensor, torch.Tensor]: ++ a = torch.randn((m, k), device='cuda') * 5 ++ b = torch.randn((n, k), device='cuda').t() * 5 ++ ++ b = prune_to_2_4(b.t()).t() ++ ++ if dtype == torch.int8: ++ a, b = to_int8(a), to_int8(b) ++ elif dtype == torch.float8_e4m3fn: ++ a, b = to_fp8(a), to_fp8(b) ++ elif dtype == torch.float16: ++ a, b = to_fp16(a), to_fp16(b) ++ elif dtype == torch.bfloat16: ++ a, b = to_bf16(a), to_bf16(b) ++ else: ++ raise ValueError("unsupported dtype") ++ ++ b_compressed, e = ops.cutlass_sparse_compress(b.t()) ++ ++ # Compressed B, Metadata, Original A, B ++ return b_compressed, e, a, b ++ ++ ++def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype, ++ m: int, n: int, k: int) -> \ ++ Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: ++ ABs = [] ++ for _ in range(num_tensors): ++ b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) ++ if b_comp is not None: ++ ABs.append(make_rand_sparse_tensors(dtype, m, n, k)) ++ BComps, Es, As, Bs = zip(*ABs) ++ return list(BComps), list(Es), list(As), list(Bs) +diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +new file mode 100644 +index 0000000..d0353bc +--- /dev/null ++++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +@@ -0,0 +1,365 @@ ++import argparse ++import copy ++import itertools ++import pickle as pkl ++import time ++from typing import Callable, Iterable, List, Tuple ++ ++import torch ++import torch.utils.benchmark as TBenchmark ++from torch.utils.benchmark import Measurement as TMeasurement ++from utils import make_rand_tensors ++from weight_shapes import WEIGHT_SHAPES ++ ++from vllm import _custom_ops as ops ++from vllm.utils import FlexibleArgumentParser ++ ++DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) ++DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] ++DEFAULT_TP_SIZES = [1] ++ ++ ++# bench ++def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, ++ **kwargs) -> TMeasurement: ++ min_run_time = 1 ++ ++ globals = { ++ "args": args, ++ "kwargs": kwargs, ++ "fn": fn, ++ } ++ return TBenchmark.Timer( ++ stmt="fn(*args, **kwargs)", ++ globals=globals, ++ label=label, ++ sub_label=sub_label, ++ description=description, ++ ).blocked_autorange(min_run_time=min_run_time) ++ ++ ++def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, ++ sub_label: str) -> Iterable[TMeasurement]: ++ assert dtype == torch.int8 ++ a, b = make_rand_tensors(torch.int8, m, n, k) ++ scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) ++ scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) ++ bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) ++ azp = torch.zeros((m, ), device="cuda", dtype=torch.int32) ++ azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32) ++ ++ timers = [] ++ # pytorch impl - bfloat16 ++ timers.append( ++ bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", ++ torch.mm, a.to(dtype=torch.bfloat16), ++ b.to(dtype=torch.bfloat16))) ++ ++ # pytorch impl - float16 ++ timers.append( ++ bench_fn(label, sub_label, ++ "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm, ++ a.to(dtype=torch.float16), b.to(dtype=torch.float16))) ++ ++ # cutlass impl ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm", ++ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, ++ torch.bfloat16)) ++ ++ # cutlass with bias ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias", ++ ops.cutlass_scaled_mm, a, b, scale_a, 
scale_b, torch.bfloat16, ++ bias)) ++ ++ # cutlass with azp per-tensor ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp", ++ ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, ++ torch.bfloat16, azp_adj)) ++ ++ # cutlass with azp per-tensor + bias ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias", ++ ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, ++ torch.bfloat16, azp_adj, None, bias)) ++ ++ # cutlass with azp per-token ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt", ++ ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, ++ torch.bfloat16, azp_adj, azp)) ++ ++ # cutlass with azp per-token + bias ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias", ++ ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, ++ torch.bfloat16, azp_adj, azp, bias)) ++ ++ return timers ++ ++ ++def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, ++ sub_label: str) -> Iterable[TMeasurement]: ++ assert dtype == torch.float8_e4m3fn ++ a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) ++ scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) ++ scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) ++ bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) ++ ++ timers = [] ++ ++ # pytorch impl w. bf16 ++ timers.append( ++ bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", ++ torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), ++ b.to(dtype=torch.bfloat16, device="cuda"))) ++ ++ # pytorch impl: bf16 output, without fp8 fast accum ++ timers.append( ++ bench_fn(label, ++ sub_label, ++ "pytorch_fp8_fp8_bf16_scaled_mm", ++ torch._scaled_mm, ++ a, ++ b, ++ scale_a=scale_a, ++ scale_b=scale_b, ++ out_dtype=torch.bfloat16)) ++ ++ # pytorch impl: bf16 output, with fp8 fast accum ++ timers.append( ++ bench_fn(label, ++ sub_label, ++ "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", ++ torch._scaled_mm, ++ a, ++ b, ++ scale_a=scale_a, ++ scale_b=scale_b, ++ out_dtype=torch.bfloat16, ++ use_fast_accum=True)) ++ ++ # pytorch impl: fp16 output, without fp8 fast accum ++ timers.append( ++ bench_fn(label, ++ sub_label, ++ "pytorch_fp8_fp8_fp16_scaled_mm", ++ torch._scaled_mm, ++ a, ++ b, ++ scale_a=scale_a, ++ scale_b=scale_b, ++ out_dtype=torch.float16)) ++ ++ # pytorch impl: fp16 output, with fp8 fast accum ++ timers.append( ++ bench_fn(label, ++ sub_label, ++ "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", ++ torch._scaled_mm, ++ a, ++ b, ++ scale_a=scale_a, ++ scale_b=scale_b, ++ out_dtype=torch.float16, ++ use_fast_accum=True)) ++ ++ # cutlass impl: bf16 output ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm", ++ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, ++ torch.bfloat16)) ++ # cutlass impl: fp16 output ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm", ++ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16)) ++ ++ # cutlass impl: bf16 output, with bias ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias", ++ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16, ++ bias)) ++ ++ # cutlass impl: fp16 output, with bias ++ timers.append( ++ bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias", ++ ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16, ++ bias.to(dtype=torch.float16))) ++ ++ return timers ++ ++ ++def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str, ++ sub_label: 
str) -> Iterable[TMeasurement]: ++ if dtype == torch.int8: ++ return bench_int8(dtype, m, k, n, label, sub_label) ++ if dtype == torch.float8_e4m3fn: ++ return bench_fp8(dtype, m, k, n, label, sub_label) ++ raise ValueError("unsupported type") ++ ++ ++# runner ++def print_timers(timers: Iterable[TMeasurement]): ++ compare = TBenchmark.Compare(timers) ++ compare.print() ++ ++ ++def run(dtype: torch.dtype, ++ MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: ++ results = [] ++ for m, k, n in MKNs: ++ timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", ++ f"MKN=({m}x{k}x{n})") ++ print_timers(timers) ++ results.extend(timers) ++ ++ return results ++ ++ ++# output makers ++def make_output(data: Iterable[TMeasurement], ++ MKNs: Iterable[Tuple[int, int, int]], ++ base_description: str, ++ timestamp=None): ++ print(f"== All Results {base_description} ====") ++ print_timers(data) ++ ++ # pickle all the results ++ timestamp = int(time.time()) if timestamp is None else timestamp ++ with open(f"{base_description}-{timestamp}.pkl", "wb") as f: ++ pkl.dump(data, f) ++ ++ ++# argparse runners ++ ++ ++def run_square_bench(args): ++ dim_sizes = list( ++ range(args.dim_start, args.dim_end + 1, args.dim_increment)) ++ MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) ++ data = run(args.dtype, MKNs) ++ ++ make_output(data, MKNs, f"square_bench-{args.dtype}") ++ ++ ++def run_range_bench(args): ++ dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment)) ++ n = len(dim_sizes) ++ Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes ++ Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes ++ Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes ++ MKNs = list(zip(Ms, Ks, Ns)) ++ data = run(args.dtype, MKNs) ++ ++ make_output(data, MKNs, f"range_bench-{args.dtype}") ++ ++ ++def run_model_bench(args): ++ print("Benchmarking models:") ++ for i, model in enumerate(args.models): ++ print(f"[{i}] {model}") ++ ++ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: ++ KNs = [] ++ for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): ++ KN[tp_split_dim] = KN[tp_split_dim] // tp_size ++ KNs.append(KN) ++ return KNs ++ ++ model_bench_data = [] ++ models_tps = list(itertools.product(args.models, args.tp_sizes)) ++ for model, tp_size in models_tps: ++ Ms = args.batch_sizes ++ KNs = model_shapes(model, tp_size) ++ MKNs = [] ++ for m in Ms: ++ for k, n in KNs: ++ MKNs.append((m, k, n)) ++ ++ data = run(args.dtype, MKNs) ++ model_bench_data.append(data) ++ ++ # Print all results ++ for data, model_tp in zip(model_bench_data, models_tps): ++ model, tp_size = model_tp ++ print(f"== Results {args.dtype} {model}-TP{tp_size} ====") ++ print_timers(data) ++ ++ timestamp = int(time.time()) ++ ++ all_data = [] ++ for d in model_bench_data: ++ all_data.extend(d) ++ # pickle all data ++ with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: ++ pkl.dump(all_data, f) ++ ++ ++if __name__ == '__main__': ++ ++ def to_torch_dtype(dt): ++ if dt == "int8": ++ return torch.int8 ++ if dt == "fp8": ++ return torch.float8_e4m3fn ++ raise ValueError("unsupported dtype") ++ ++ parser = FlexibleArgumentParser( ++ description=""" ++Benchmark Cutlass GEMM. 
++ ++ To run square GEMMs: ++ python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 ++ ++ To run constant N and K and sweep M: ++ python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 ++ ++ To run dimensions from a model: ++ python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 ++ ++ Output: ++ - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. ++ """, # noqa: E501 ++ formatter_class=argparse.RawTextHelpFormatter) ++ ++ parser.add_argument("--dtype", ++ type=to_torch_dtype, ++ required=True, ++ help="Available options are ['int8', 'fp8']") ++ subparsers = parser.add_subparsers(dest="cmd") ++ ++ square_parser = subparsers.add_parser("square_bench") ++ square_parser.add_argument("--dim-start", type=int, required=True) ++ square_parser.add_argument("--dim-end", type=int, required=True) ++ square_parser.add_argument("--dim-increment", type=int, required=True) ++ square_parser.set_defaults(func=run_square_bench) ++ ++ range_parser = subparsers.add_parser("range_bench") ++ range_parser.add_argument("--dim-start", type=int, required=True) ++ range_parser.add_argument("--dim-end", type=int, required=True) ++ range_parser.add_argument("--dim-increment", type=int, required=True) ++ range_parser.add_argument("--m-constant", type=int, default=None) ++ range_parser.add_argument("--n-constant", type=int, default=None) ++ range_parser.add_argument("--k-constant", type=int, default=None) ++ range_parser.set_defaults(func=run_range_bench) ++ ++ model_parser = subparsers.add_parser("model_bench") ++ model_parser.add_argument("--models", ++ nargs="+", ++ type=str, ++ default=DEFAULT_MODELS, ++ choices=WEIGHT_SHAPES.keys()) ++ model_parser.add_argument("--tp-sizes", ++ nargs="+", ++ type=int, ++ default=DEFAULT_TP_SIZES) ++ model_parser.add_argument("--batch-sizes", ++ nargs="+", ++ type=int, ++ default=DEFAULT_BATCH_SIZES) ++ model_parser.set_defaults(func=run_model_bench) ++ ++ args = parser.parse_args() ++ args.func(args) +\ No newline at end of file +diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py +new file mode 100644 +index 0000000..d58fb0b +--- /dev/null ++++ b/benchmarks/cutlass_benchmarks/weight_shapes.py +@@ -0,0 +1,43 @@ ++# Weight Shapes are in the format ++# ([K, N], TP_SPLIT_DIM) ++# Example: ++# A shape of ([14336, 4096], 0) indicates the following GEMM shape, ++# - TP1 : K = 14336, N = 4096 ++# - TP2 : K = 7168, N = 4096 ++# A shape of ([4096, 6144], 1) indicates the following GEMM shape, ++# - TP1 : K = 4096, N = 6144 ++# - TP4 : K = 4096, N = 1536 ++ ++# TP1 shapes ++WEIGHT_SHAPES = { ++ "mistralai/Mistral-7B-v0.1": [ ++ ([4096, 6144], 1), ++ ([4096, 4096], 0), ++ ([4096, 28672], 1), ++ ([14336, 4096], 0), ++ ], ++ "meta-llama/Llama-2-7b-hf": [ ++ ([4096, 12288], 1), ++ ([4096, 4096], 0), ++ ([4096, 22016], 1), ++ ([11008, 4096], 0), ++ ], ++ "meta-llama/Llama-3-8b": [ ++ ([4096, 6144], 1), ++ ([4096, 4096], 0), ++ ([4096, 28672], 1), ++ ([14336, 4096], 0), ++ ], ++ "meta-llama/Llama-2-13b-hf": [ ++ ([5120, 15360], 1), ++ ([5120, 5120], 0), ++ ([5120, 27648], 1), ++ ([13824, 5120], 0), ++ ], ++ "meta-llama/Llama-2-70b-hf": [ ++ ([8192, 10240], 1), ++ ([8192, 
8192], 0), ++ ([8192, 57344], 1), ++ ([28672, 8192], 0), ++ ], ++} +\ No newline at end of file +diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +new file mode 100644 +index 0000000..9499963 +--- /dev/null ++++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +@@ -0,0 +1,145 @@ ++#!/bin/bash ++ ++# benchmark the overhead of disaggregated prefill. ++# methodology: ++# - send all request to prefill vLLM instance. It will buffer KV cache. ++# - then send all request to decode instance. ++# - The TTFT of decode instance is the overhead. ++ ++set -ex ++ ++kill_gpu_processes() { ++ # kill all processes on GPU. ++ pgrep pt_main_thread | xargs -r kill -9 ++ pgrep python3 | xargs -r kill -9 ++ sleep 10 ++ ++ # remove vllm config file ++ rm -rf ~/.config/vllm ++ ++ # Print the GPU memory usage ++ # so that we know if all GPU processes are killed. ++ gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) ++ # The memory usage should be 0 MB. ++ echo "GPU 0 Memory Usage: $gpu_memory_usage MB" ++} ++ ++wait_for_server() { ++ # wait for vllm server to start ++ # return 1 if vllm server crashes ++ local port=$1 ++ timeout 1200 bash -c " ++ until curl -s localhost:${port}/v1/completions > /dev/null; do ++ sleep 1 ++ done" && return 0 || return 1 ++} ++ ++ ++benchmark() { ++ ++ export VLLM_LOGGING_LEVEL=DEBUG ++ export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') ++ ++ # compare chunked prefill with disaggregated prefill ++ ++ results_folder="./results" ++ model="meta-llama/Meta-Llama-3.1-8B-Instruct" ++ dataset_name="sonnet" ++ dataset_path="../sonnet_4x.txt" ++ num_prompts=10 ++ qps=$1 ++ prefix_len=50 ++ input_len=2048 ++ output_len=$2 ++ ++ ++ CUDA_VISIBLE_DEVICES=0 python3 \ ++ -m vllm.entrypoints.openai.api_server \ ++ --model $model \ ++ --port 8100 \ ++ --max-model-len 10000 \ ++ --gpu-memory-utilization 0.6 \ ++ --kv-transfer-config \ ++ '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & ++ ++ ++ CUDA_VISIBLE_DEVICES=1 python3 \ ++ -m vllm.entrypoints.openai.api_server \ ++ --model $model \ ++ --port 8200 \ ++ --max-model-len 10000 \ ++ --gpu-memory-utilization 0.6 \ ++ --kv-transfer-config \ ++ '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & ++ ++ wait_for_server 8100 ++ wait_for_server 8200 ++ ++ # let the prefill instance finish prefill ++ python3 ../benchmark_serving.py \ ++ --backend vllm \ ++ --model $model \ ++ --dataset-name $dataset_name \ ++ --dataset-path $dataset_path \ ++ --sonnet-input-len $input_len \ ++ --sonnet-output-len "$output_len" \ ++ --sonnet-prefix-len $prefix_len \ ++ --num-prompts $num_prompts \ ++ --port 8100 \ ++ --save-result \ ++ --result-dir $results_folder \ ++ --result-filename disagg_prefill_tp1.json \ ++ --request-rate "inf" ++ ++ ++ # send the request to decode. ++ # The TTFT of this command will be the overhead of disagg prefill impl. 
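A small sketch, not part of the script, of how the two result files this benchmark writes could be compared afterwards: the prefill-side run produces the first file, the decode-side run that follows produces the second, and the decode-side TTFT approximates the KV-transfer overhead. The `mean_ttft_ms` key is the one `benchmark_serving.py --save-result` emits; file names match the `--result-filename` values used here.

import json

with open("results/disagg_prefill_tp1.json") as f:
    prefill = json.load(f)
with open("results/disagg_prefill_tp1_overhead.json") as f:
    decode = json.load(f)

# The decode instance pulls the prompt's KV cache from the prefill instance,
# so its TTFT is (approximately) the disaggregated-prefill overhead.
print(f"decode-side mean TTFT (KV transfer overhead): {decode['mean_ttft_ms']:.2f} ms")
print(f"prefill-side mean TTFT (baseline prefill):    {prefill['mean_ttft_ms']:.2f} ms")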
++ python3 ../benchmark_serving.py \ ++ --backend vllm \ ++ --model $model \ ++ --dataset-name $dataset_name \ ++ --dataset-path $dataset_path \ ++ --sonnet-input-len $input_len \ ++ --sonnet-output-len "$output_len" \ ++ --sonnet-prefix-len $prefix_len \ ++ --num-prompts $num_prompts \ ++ --port 8200 \ ++ --save-result \ ++ --result-dir $results_folder \ ++ --result-filename disagg_prefill_tp1_overhead.json \ ++ --request-rate "$qps" ++ kill_gpu_processes ++ ++} ++ ++ ++main() { ++ ++ (which wget && which curl) || (apt-get update && apt-get install -y wget curl) ++ (which jq) || (apt-get -y install jq) ++ (which socat) || (apt-get -y install socat) ++ ++ pip install quart httpx datasets ++ ++ cd "$(dirname "$0")" ++ ++ cd .. ++ # create sonnet-4x.txt ++ echo "" > sonnet_4x.txt ++ for _ in {1..4} ++ do ++ cat sonnet.txt >> sonnet_4x.txt ++ done ++ cd disagg_benchmarks ++ ++ rm -rf results ++ mkdir results ++ ++ default_qps=1 ++ default_output_len=1 ++ benchmark $default_qps $default_output_len ++ ++} ++ ++ ++main "$@" +diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +new file mode 100644 +index 0000000..eb5d891 +--- /dev/null ++++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +@@ -0,0 +1,163 @@ ++#!/bin/bash ++ ++# Requirement: 2x GPUs. ++ ++ ++# Model: meta-llama/Meta-Llama-3.1-8B-Instruct ++# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests ++# Resource: 2x GPU ++# Approaches: ++# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 ++# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance ++# Prefilling instance: max_output_token=1 ++# Decoding instance: force the input tokens be the same across requests to bypass prefilling ++ ++set -ex ++ ++kill_gpu_processes() { ++ # kill all processes on GPU. 
++ pgrep pt_main_thread | xargs -r kill -9 ++ pgrep python3 | xargs -r kill -9 ++ for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done ++ sleep 1 ++} ++ ++wait_for_server() { ++ # wait for vllm server to start ++ # return 1 if vllm server crashes ++ local port=$1 ++ timeout 1200 bash -c " ++ until curl -s localhost:${port}/v1/completions > /dev/null; do ++ sleep 1 ++ done" && return 0 || return 1 ++} ++ ++ ++launch_chunked_prefill() { ++ model="meta-llama/Meta-Llama-3.1-8B-Instruct" ++ # disagg prefill ++ CUDA_VISIBLE_DEVICES=0 python3 \ ++ -m vllm.entrypoints.openai.api_server \ ++ --model $model \ ++ --port 8100 \ ++ --max-model-len 10000 \ ++ --enable-chunked-prefill \ ++ --gpu-memory-utilization 0.6 & ++ CUDA_VISIBLE_DEVICES=1 python3 \ ++ -m vllm.entrypoints.openai.api_server \ ++ --model $model \ ++ --port 8200 \ ++ --max-model-len 10000 \ ++ --enable-chunked-prefill \ ++ --gpu-memory-utilization 0.6 & ++ wait_for_server 8100 ++ wait_for_server 8200 ++ python3 round_robin_proxy.py & ++ sleep 1 ++} ++ ++ ++launch_disagg_prefill() { ++ model="meta-llama/Meta-Llama-3.1-8B-Instruct" ++ # disagg prefill ++ CUDA_VISIBLE_DEVICES=0 python3 \ ++ -m vllm.entrypoints.openai.api_server \ ++ --model $model \ ++ --port 8100 \ ++ --max-model-len 10000 \ ++ --gpu-memory-utilization 0.6 \ ++ --kv-transfer-config \ ++ '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & ++ ++ CUDA_VISIBLE_DEVICES=1 python3 \ ++ -m vllm.entrypoints.openai.api_server \ ++ --model $model \ ++ --port 8200 \ ++ --max-model-len 10000 \ ++ --gpu-memory-utilization 0.6 \ ++ --kv-transfer-config \ ++ '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & ++ ++ wait_for_server 8100 ++ wait_for_server 8200 ++ python3 disagg_prefill_proxy_server.py & ++ sleep 1 ++} ++ ++ ++benchmark() { ++ results_folder="./results" ++ model="meta-llama/Meta-Llama-3.1-8B-Instruct" ++ dataset_name="sonnet" ++ dataset_path="../sonnet_4x.txt" ++ num_prompts=100 ++ qps=$1 ++ prefix_len=50 ++ input_len=1024 ++ output_len=$2 ++ tag=$3 ++ ++ python3 ../benchmark_serving.py \ ++ --backend vllm \ ++ --model $model \ ++ --dataset-name $dataset_name \ ++ --dataset-path $dataset_path \ ++ --sonnet-input-len $input_len \ ++ --sonnet-output-len "$output_len" \ ++ --sonnet-prefix-len $prefix_len \ ++ --num-prompts $num_prompts \ ++ --port 8000 \ ++ --save-result \ ++ --result-dir $results_folder \ ++ --result-filename "$tag"-qps-"$qps".json \ ++ --request-rate "$qps" ++ ++ sleep 2 ++} ++ ++ ++main() { ++ ++ (which wget && which curl) || (apt-get update && apt-get install -y wget curl) ++ (which jq) || (apt-get -y install jq) ++ (which socat) || (apt-get -y install socat) ++ (which lsof) || (apt-get -y install lsof) ++ ++ pip install quart httpx matplotlib aiohttp datasets ++ ++ cd "$(dirname "$0")" ++ ++ cd .. 
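A quick manual smoke test, outside the benchmark flow, showing one `/v1/completions` request sent through the proxy that the launch helpers start on port 8000, before running the full QPS sweep. The prompt text is arbitrary; the model name and port match the script above.

import json
import urllib.request

payload = {
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "prompt": "San Francisco is a",
    "max_tokens": 16,
}
req = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["choices"][0]["text"])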
++ # create sonnet-4x.txt so that we can sample 2048 tokens for input ++ echo "" > sonnet_4x.txt ++ for _ in {1..4} ++ do ++ cat sonnet.txt >> sonnet_4x.txt ++ done ++ cd disagg_benchmarks ++ ++ rm -rf results ++ mkdir results ++ ++ default_output_len=6 ++ ++ export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') ++ ++ launch_chunked_prefill ++ for qps in 2 4 6 8; do ++ benchmark $qps $default_output_len chunked_prefill ++ done ++ kill_gpu_processes ++ ++ launch_disagg_prefill ++ for qps in 2 4 6 8; do ++ benchmark $qps $default_output_len disagg_prefill ++ done ++ kill_gpu_processes ++ ++ python3 visualize_benchmark_results.py ++ ++} ++ ++ ++main "$@" +diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +new file mode 100644 +index 0000000..4058b1c +--- /dev/null ++++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +@@ -0,0 +1,61 @@ ++import os ++ ++import aiohttp ++from quart import Quart, make_response, request ++ ++AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) ++ ++app = Quart(__name__) ++ ++ ++async def forward_request(url, data): ++ async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: ++ headers = { ++ "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" ++ } ++ async with session.post(url=url, json=data, ++ headers=headers) as response: ++ if response.status == 200: ++ # if response.headers.get('Transfer-Encoding') == 'chunked': ++ if True: ++ async for chunk_bytes in response.content.iter_chunked( ++ 1024): ++ yield chunk_bytes ++ else: ++ content = await response.read() ++ yield content ++ ++ ++@app.route('/v1/completions', methods=['POST']) ++async def handle_request(): ++ try: ++ original_request_data = await request.get_json() ++ ++ prefill_request = original_request_data.copy() ++ # change max_tokens = 1 to let it only do prefill ++ prefill_request['max_tokens'] = 1 ++ ++ # finish prefill ++ async for _ in forward_request('http://localhost:8100/v1/completions', ++ prefill_request): ++ continue ++ ++ # return decode ++ generator = forward_request('http://localhost:8200/v1/completions', ++ original_request_data) ++ response = await make_response(generator) ++ response.timeout = None ++ ++ return response ++ ++ except Exception as e: ++ import sys ++ import traceback ++ exc_info = sys.exc_info() ++ print("Error occurred in disagg prefill proxy server") ++ print(e) ++ print("".join(traceback.format_exception(*exc_info))) ++ ++ ++if __name__ == '__main__': ++ app.run(port=8000) +diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py +new file mode 100644 +index 0000000..6eb5f63 +--- /dev/null ++++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py +@@ -0,0 +1,60 @@ ++import asyncio ++import itertools ++ ++import aiohttp ++from aiohttp import web ++ ++ ++class RoundRobinProxy: ++ ++ def __init__(self, target_ports): ++ self.target_ports = target_ports ++ self.port_cycle = itertools.cycle(self.target_ports) ++ ++ async def handle_request(self, request): ++ target_port = next(self.port_cycle) ++ target_url = f"http://localhost:{target_port}{request.path_qs}" ++ ++ async with aiohttp.ClientSession() as session: ++ try: ++ # Forward the request ++ async with session.request( ++ method=request.method, ++ url=target_url, ++ headers=request.headers, ++ data=request.content, ++ ) as response: ++ # Start sending the response ++ resp = web.StreamResponse(status=response.status, ++ 
headers=response.headers) ++ await resp.prepare(request) ++ ++ # Stream the response content ++ async for chunk in response.content.iter_any(): ++ await resp.write(chunk) ++ ++ await resp.write_eof() ++ return resp ++ ++ except Exception as e: ++ return web.Response(text=f"Error: {str(e)}", status=500) ++ ++ ++async def main(): ++ proxy = RoundRobinProxy([8100, 8200]) ++ app = web.Application() ++ app.router.add_route('*', '/{path:.*}', proxy.handle_request) ++ ++ runner = web.AppRunner(app) ++ await runner.setup() ++ site = web.TCPSite(runner, 'localhost', 8000) ++ await site.start() ++ ++ print("Proxy server started on http://localhost:8000") ++ ++ # Keep the server running ++ await asyncio.Event().wait() ++ ++ ++if __name__ == '__main__': ++ asyncio.run(main()) +diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +new file mode 100644 +index 0000000..e59d8bb +--- /dev/null ++++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +@@ -0,0 +1,46 @@ ++import json ++ ++import matplotlib.pyplot as plt ++import pandas as pd ++ ++if __name__ == "__main__": ++ ++ data = [] ++ for name in ['disagg_prefill', 'chunked_prefill']: ++ for qps in [2, 4, 6, 8]: ++ with open(f"results/{name}-qps-{qps}.json") as f: ++ x = json.load(f) ++ x['name'] = name ++ x['qps'] = qps ++ data.append(x) ++ ++ df = pd.DataFrame.from_dict(data) ++ dis_df = df[df['name'] == 'disagg_prefill'] ++ chu_df = df[df['name'] == 'chunked_prefill'] ++ ++ plt.style.use('bmh') ++ plt.rcParams['font.size'] = 20 ++ ++ for key in [ ++ 'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms', ++ 'median_itl_ms', 'p99_itl_ms' ++ ]: ++ ++ fig, ax = plt.subplots(figsize=(11, 7)) ++ plt.plot(dis_df['qps'], ++ dis_df[key], ++ label='disagg_prefill', ++ marker='o', ++ linewidth=4) ++ plt.plot(chu_df['qps'], ++ chu_df[key], ++ label='chunked_prefill', ++ marker='o', ++ linewidth=4) ++ ax.legend() ++ ++ ax.set_xlabel('QPS') ++ ax.set_ylabel(key) ++ ax.set_ylim(bottom=0) ++ fig.savefig(f'results/{key}.png') ++ plt.close(fig) +diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +new file mode 100644 +index 0000000..ef91f9f +--- /dev/null ++++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +@@ -0,0 +1,173 @@ ++import pickle as pkl ++import time ++from dataclasses import dataclass ++from itertools import product ++from typing import Callable, Iterable, List, Optional ++ ++import torch ++import torch.utils.benchmark as TBenchmark ++from torch.utils.benchmark import Measurement as TMeasurement ++from tqdm import tqdm ++ ++import vllm._custom_ops as ops ++from vllm.model_executor.layers.layernorm import RMSNorm ++ ++ ++@dataclass ++class bench_params_t: ++ num_tokens: int ++ hidden_size: int ++ add_residual: bool ++ dtype: torch.dtype ++ ++ def description(self): ++ return (f'N {self.num_tokens} ' ++ f'x D {self.hidden_size} ' ++ f'x R {self.add_residual} ' ++ f'x DT {self.dtype}') ++ ++ ++def get_bench_params() -> List[bench_params_t]: ++ ## Test Fixtures ++ NUM_TOKENS = [2**x for x in range(11)] ++ HIDDEN_SIZES = list(range(1024, 8129, 1024)) ++ ADD_RESIDUAL = [True, False] ++ DTYPES = [torch.bfloat16, torch.float] ++ ++ combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES) ++ bench_params = list(map(lambda x: \ ++ bench_params_t(x[0], x[1], x[2], x[3]), combinations)) ++ return bench_params ++ ++ ++# Reference impls ++def unfused_int8_impl(rms_norm_layer: 
RMSNorm, x: torch.Tensor, ++ residual: Optional[torch.Tensor], ++ quant_dtype: torch.dtype): ++ # Norm ++ torch_out = None ++ if residual is None: ++ torch_out = rms_norm_layer.forward_cuda(x, residual) ++ else: ++ torch_out, _ = rms_norm_layer.forward_cuda(x, residual) ++ ++ # Quant ++ torch_out, _, _ = ops.scaled_int8_quant(torch_out) ++ ++ ++def unfused_fp8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor, ++ residual: Optional[torch.Tensor], ++ quant_dtype: torch.dtype): ++ # Norm ++ torch_out = None ++ if residual is None: ++ torch_out = rms_norm_layer.forward_cuda(x, residual) ++ else: ++ torch_out, _ = rms_norm_layer.forward_cuda(x, residual) ++ ++ # Quant ++ torch_out, _ = ops.scaled_fp8_quant(torch_out) ++ ++ ++def fused_impl( ++ rms_norm_layer: RMSNorm, # this stores the weights ++ x: torch.Tensor, ++ residual: Optional[torch.Tensor], ++ quant_dtype: torch.dtype): ++ out, _ = ops.rms_norm_dynamic_per_token_quant(x, ++ rms_norm_layer.weight, ++ 1e-6, ++ quant_dtype, ++ residual=residual) ++ ++ ++# Bench functions ++def bench_fn(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor, ++ quant_dtype: torch.dtype, label: str, sub_label: str, ++ fn: Callable, description: str) -> TMeasurement: ++ ++ min_run_time = 1 ++ ++ globals = { ++ "rms_norm_layer": rms_norm_layer, ++ "x": x, ++ "residual": residual, ++ "quant_dtype": quant_dtype, ++ "fn": fn, ++ } ++ return TBenchmark.Timer( ++ stmt="fn(rms_norm_layer, x, residual, quant_dtype)", ++ globals=globals, ++ label=label, ++ sub_label=sub_label, ++ description=description, ++ ).blocked_autorange(min_run_time=min_run_time) ++ ++def bench(params: bench_params_t, label: str, sub_label: str) \ ++ -> Iterable[TMeasurement]: ++ ++ # Make inputs ++ layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype) ++ # Make weights ++ layer.weight.data.normal_(mean=1.0, std=0.1) ++ # Make inputs ++ scale = 1 / params.hidden_size ++ x = torch.randn(params.num_tokens, ++ params.hidden_size, ++ dtype=params.dtype, ++ device='cuda') * scale ++ residual = (torch.randn_like(x) * scale).to(device='cuda') \ ++ if params.add_residual else None ++ ++ timers = [] ++ ++ # unfused int8 impl. ++ timers.append( ++ bench_fn(layer, x, residual, torch.int8, label, sub_label, ++ unfused_int8_impl, "unfused_int8_impl")) ++ ++ # unfused fp8 impl. ++ timers.append( ++ bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label, ++ unfused_fp8_impl, "unfused_fp8_impl")) ++ ++ # fused int8 impl. ++ timers.append( ++ bench_fn(layer, x, residual, torch.int8, label, sub_label, fused_impl, ++ "fused_int8_impl")) ++ ++ # fused fp8 impl. 
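A plain-PyTorch reference sketch, separate from the benchmark, of what "RMSNorm followed by dynamic per-token int8 quantization" computes; it is only meant to make the fused/unfused comparison concrete, and the real kernels may differ in scale and epsilon handling.

import torch

def rmsnorm_then_int8_quant(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6):
    # RMSNorm over the hidden dimension.
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    normed = x * torch.rsqrt(variance + eps) * weight
    # Dynamic per-token (per-row) symmetric int8 scales.
    scales = normed.abs().amax(dim=-1, keepdim=True).clamp(min=1e-10) / 127.0
    quant = torch.round(normed / scales).clamp(-128, 127).to(torch.int8)
    return quant, scales

x = torch.randn(4, 1024)
w = torch.ones(1024)
q, s = rmsnorm_then_int8_quant(x, w)
print(q.dtype, s.shape)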
++ timers.append( ++ bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label, ++ fused_impl, "fused_fp8_impl")) ++ ++ print_timers(timers) ++ ++ return timers ++ ++ ++# launch bench ++# runner ++def print_timers(timers: Iterable[TMeasurement]): ++ compare = TBenchmark.Compare(timers) ++ compare.print() ++ ++ ++def main(): ++ torch.set_default_device('cuda') ++ bench_params = get_bench_params() ++ ++ timers = [] ++ for bp in tqdm(bench_params): ++ timers.extend( ++ bench(bp, "rms-norm-dynamic-per-token-quant", bp.description())) ++ print_timers(timers) ++ ++ # pickle all the results ++ timestamp = int(time.time()) ++ with open(f"rms_norm_dpt_quant-{timestamp}.pkl", "wb") as f: ++ pkl.dump(timers, f) ++ ++ ++if __name__ == '__main__': ++ main() +diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py +index 5939294..601c4ea 100644 +--- a/benchmarks/kernels/benchmark_aqlm.py ++++ b/benchmarks/kernels/benchmark_aqlm.py +@@ -1,4 +1,3 @@ +-import argparse + import os + import sys + from typing import Optional +@@ -10,6 +9,7 @@ from vllm import _custom_ops as ops + from vllm.model_executor.layers.quantization.aqlm import ( + dequantize_weight, generic_dequantize_gemm, get_int_dtype, + optimized_dequantize_gemm) ++from vllm.utils import FlexibleArgumentParser + + os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +@@ -86,9 +86,9 @@ def dequant_no_scale( + # Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against + # the generic pytorch version. + # Just visual comparison. +-def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: ++def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: + +- n = parts.sum().item() ++ n = int(parts.sum().item()) + + device = torch.device('cuda:0') + +@@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: + + def main(): + +- parser = argparse.ArgumentParser(description="Benchmark aqlm performance.") ++ parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") + + # Add arguments + parser.add_argument("--nbooks", +@@ -204,7 +204,7 @@ def main(): + sys.stdout = sys.__stdout__ + + +-def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, ++def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, + methods): + + # I didn't see visible improvements from increasing these, but feel free :) +@@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, + print('') + + +-def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, ++def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor, + nbooks: int, bits: int, method) -> float: + +- n = parts.sum().item() ++ n = int(parts.sum().item()) + + device = torch.device('cuda:0') + +diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py +new file mode 100644 +index 0000000..7acea60 +--- /dev/null ++++ b/benchmarks/kernels/benchmark_layernorm.py +@@ -0,0 +1,86 @@ ++import time ++ ++import torch ++ ++from vllm.model_executor.layers.layernorm import RMSNorm ++from vllm.platforms import current_platform ++from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser ++ ++ ++@torch.inference_mode() ++def main(num_tokens: int, ++ hidden_size: int, ++ add_residual: bool, ++ dtype: torch.dtype, ++ seed: int = 0, ++ do_profile: bool = False, ++ num_warmup_iters: int = 5, ++ num_iters: int = 100) -> None: ++ 
current_platform.seed_everything(seed)
++    torch.set_default_device("cuda")
++
++    layer = RMSNorm(hidden_size).to(dtype=dtype)
++    layer.weight.data.normal_(mean=1.0, std=0.1)
++    scale = 1 / (2 * hidden_size)
++    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
++    x *= scale
++    residual = torch.randn_like(x) * scale if add_residual else None
++
++    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
++        torch.cuda.synchronize()
++        if profile:
++            torch.cuda.cudart().cudaProfilerStart()
++        start_time = time.perf_counter()
++
++        for _ in range(num_iters):
++            layer(x, residual)
++        torch.cuda.synchronize()
++
++        end_time = time.perf_counter()
++        if profile:
++            torch.cuda.cudart().cudaProfilerStop()
++        return (end_time - start_time) / num_iters
++
++    # Warmup.
++    print("Warming up...")
++    run_benchmark = run_cuda_benchmark
++    run_benchmark(num_iters=num_warmup_iters, profile=False)
++
++    # Benchmark.
++    if do_profile:
++        latency = run_benchmark(num_iters=1, profile=True)
++    else:
++        latency = run_benchmark(num_iters=num_iters, profile=False)
++    print(f"Kernel running time: {latency * 1000000:.3f} us")
++
++
++if __name__ == '__main__':
++    parser = FlexibleArgumentParser(
++        description="Benchmark the layernorm kernel.")
++    parser.add_argument("--num-tokens", type=int, default=4096)
++    parser.add_argument("--hidden-size", type=int, default=8192)
++    parser.add_argument("--add-residual", action="store_true")
++    parser.add_argument("--dtype",
++                        type=str,
++                        choices=["half", "bfloat16", "float"],
++                        default="half")
++    parser.add_argument("--seed", type=int, default=0)
++    parser.add_argument("--profile", action="store_true")
++    parser.add_argument("--num-warmup-iters", type=int, default=5)
++    parser.add_argument("--num-iters",
++                        type=int,
++                        default=100,
++                        help="Number of benchmark iterations. 
" ++ "If --profile is set, this number is ignored") ++ ++ args = parser.parse_args() ++ print(args) ++ ++ main(num_tokens=args.num_tokens, ++ hidden_size=args.hidden_size, ++ add_residual=args.add_residual, ++ dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], ++ seed=args.seed, ++ do_profile=args.profile, ++ num_warmup_iters=args.num_warmup_iters, ++ num_iters=args.num_iters) +diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py +new file mode 100644 +index 0000000..46bab74 +--- /dev/null ++++ b/benchmarks/kernels/benchmark_machete.py +@@ -0,0 +1,672 @@ ++import argparse ++import copy ++import itertools ++import math ++import os ++import pickle as pkl ++import time ++from dataclasses import dataclass ++from itertools import product ++from typing import Callable, Iterable, List, Optional, Tuple ++ ++import pandas as pd ++import torch ++import torch.utils.benchmark as TBenchmark ++from torch.utils.benchmark import Measurement as TMeasurement ++from weight_shapes import WEIGHT_SHAPES ++ ++from vllm import _custom_ops as ops ++from vllm.model_executor.layers.quantization.utils.marlin_utils import ( ++ GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales, ++ marlin_zero_points) ++from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( ++ MarlinWorkspace) ++from vllm.model_executor.layers.quantization.utils.quant_utils import ( ++ pack_rows, quantize_weights) ++from vllm.scalar_type import ScalarType, scalar_types ++from vllm.utils import FlexibleArgumentParser ++ ++DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"] ++DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024] ++DEFAULT_TP_SIZES = [1] ++ ++NVTX_PROFILE = os.environ.get("NVTX_PROFILE", False) ++ ++if NVTX_PROFILE: ++ import nvtx ++ ++ ++def terse_type_name(dt): ++ return { ++ torch.bfloat16: "bf16", ++ torch.float16: "fp16", ++ torch.int8: "int8", ++ torch.float8_e4m3fn: "fp8", ++ torch.bfloat16: "bf16", ++ torch.float: "float", ++ torch.int: "int", ++ }[dt] ++ ++ ++@dataclass ++class BenchmarkTensors: ++ w_ref: torch.Tensor ++ a: torch.Tensor ++ ++ w_q: torch.Tensor ++ group_size: Optional[int] ++ wtype: ScalarType ++ w_g_s: torch.Tensor ++ w_g_zp: Optional[torch.Tensor] ++ w_ch_s: Optional[torch.Tensor] ++ w_tok_s: Optional[torch.Tensor] ++ ++ ++@dataclass ++class TypeConfig: ++ act_type: torch.dtype ++ weight_type: ScalarType ++ output_type: Optional[torch.dtype] ++ group_scale_type: Optional[torch.dtype] ++ group_zero_type: Optional[torch.dtype] ++ channel_scale_type: Optional[torch.dtype] ++ token_scale_type: Optional[torch.dtype] ++ ++ ++def rand_data(shape, dtype=torch.float16, scale=1): ++ if dtype.is_floating_point: ++ return (scale * torch.rand(shape, device="cuda") - 0.3).to(dtype) ++ else: ++ return torch.randint(-15, 15, shape, dtype=dtype, device="cuda") ++ ++ ++def quantize_and_pack(atype: torch.dtype, ++ w: torch.Tensor, ++ wtype: ScalarType, ++ stype: Optional[torch.dtype], ++ group_size: Optional[int], ++ zero_points: bool = False): ++ assert wtype.is_integer(), "TODO: support floating point weights" ++ ++ w_ref, w_q, w_s, w_zp = quantize_weights( ++ w, ++ wtype, ++ group_size=group_size, ++ zero_points=zero_points, ++ # to match how the kernel applies zps ++ ref_zero_points_after_scales=True) ++ ++ w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape) ++ return w_ref, w_q, w_s, w_zp ++ ++ ++def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig, ++ group_size: Optional[int]) -> List[BenchmarkTensors]: 
++ m, n, k = shape ++ ++ # we want to make sure that weights don't fit into L2 cache between runs so ++ # we construct enough weights to exceed L2 cache, which is 50mb on a H100 ++ # so we target total weight size > 2*50mb ++ num_weights = math.ceil(2 * 50 * 1024**2 * 8 / ++ (k * n * types.weight_type.size_bits)) ++ ++ a = rand_data((m, k), types.act_type, scale=5) ++ ++ benchmark_tensors: List[BenchmarkTensors] = [] ++ for _ in range(num_weights): ++ w = rand_data((k, n), types.act_type, scale=5) ++ ++ if types.group_scale_type is not None: ++ w = w.to(types.group_scale_type) ++ if w.dtype.itemsize == 1: ++ w = w.to(torch.float16) ++ ++ w_ref, w_q_packed, w_s, w_zp = quantize_and_pack( ++ a.dtype, w, types.weight_type, types.group_scale_type, group_size, ++ types.group_zero_type is not None) ++ ++ if not a.dtype.is_floating_point: ++ aiinfo = torch.iinfo(a.dtype) ++ w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max) ++ ++ w_ref = w_ref.to(torch.float32) ++ ++ w_ch_s = None if types.channel_scale_type is None else\ ++ rand_data((n,), types.channel_scale_type) ++ w_tok_s = None if types.token_scale_type is None else\ ++ rand_data((m,), types.token_scale_type) ++ ++ benchmark_tensors.append( ++ BenchmarkTensors(w_ref=w_ref, ++ a=a, ++ w_q=w_q_packed, ++ wtype=types.weight_type, ++ w_g_s=w_s, ++ w_g_zp=w_zp, ++ group_size=group_size, ++ w_ch_s=w_ch_s, ++ w_tok_s=w_tok_s)) ++ ++ return benchmark_tensors ++ ++ ++def torch_matmul_f16_create_bench_fn(bt: BenchmarkTensors) -> Callable: ++ a = bt.a ++ w = bt.w_ref.to(bt.a.dtype) # use float reference tensor ++ if a.dtype not in [torch.float16, torch.bfloat16]: ++ a = a.to(torch.float16) ++ w = w.to(torch.float16) ++ return lambda: torch.matmul(a, w) ++ ++ ++def cutlass_scaled_mm_create_bench_fn(bt: BenchmarkTensors) -> Callable: ++ if bt.w_ch_s is not None and bt.w_tok_s is not None: ++ scale_a = bt.w_tok_s.to(torch.float32) ++ scale_b = bt.w_ch_s.to(torch.float32) ++ else: ++ scale_a = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device) ++ scale_b = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device) ++ w_col_major = bt.w_ref.to(bt.a.dtype).t().contiguous().t() ++ return lambda: ops.cutlass_scaled_mm( ++ bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16) ++ ++ ++def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: ++ device = bt.a.device ++ ++ workspace = MarlinWorkspace(bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N, ++ GPTQ_MARLIN_MAX_PARALLEL) ++ ++ if bt.w_g_zp is None: ++ w_zp = torch.empty(0, dtype=torch.int, device=device) ++ else: ++ w_zp = marlin_zero_points(bt.w_g_zp, bt.w_ref.shape[0], ++ bt.w_ref.shape[1], bt.wtype.size_bits) ++ ++ if bt.group_size is None: ++ w_s = torch.tensor([], device="cuda", dtype=torch.half) ++ else: ++ w_s = marlin_permute_scales(bt.w_g_s, bt.w_ref.shape[0], ++ bt.w_ref.shape[1], bt.group_size) ++ ++ sort_indices = torch.empty(0, dtype=torch.int, device=device) ++ g_idx = torch.empty(0, dtype=torch.int, device=device) ++ w_q = ops.gptq_marlin_repack(bt.w_q, sort_indices, bt.w_ref.shape[0], ++ bt.w_ref.shape[1], bt.wtype.size_bits) ++ ++ if bt.a.dtype.is_floating_point: ++ assert bt.w_ch_s is None ++ assert bt.w_tok_s is None ++ assert bt.group_size is not None ++ ++ fn = lambda: ops.gptq_marlin_gemm(a=bt.a, ++ b_q_weight=w_q, ++ b_scales=w_s, ++ b_zeros=w_zp, ++ g_idx=g_idx, ++ perm=sort_indices, ++ workspace=workspace.scratch, ++ b_q_type=bt.wtype, ++ size_m=bt.a.shape[0], ++ size_n=bt.w_ref.shape[1], ++ size_k=bt.w_ref.shape[0], ++ is_k_full=True, ++ is_zp_float=False) ++ else: 
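++        # int8 activations: benchmark the QQQ (W4A8) Marlin kernel instead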
++ assert bt.a.dtype == torch.int8 ++ assert bt.wtype == scalar_types.uint4b8 ++ ++ if bt.w_ch_s is not None: ++ s_ch = bt.w_ch_s.to(torch.float32) ++ else: ++ s_ch = torch.ones(bt.w_ref.shape[1], ++ dtype=torch.float32, ++ device=device) ++ ++ if bt.w_tok_s is not None: ++ s_tok = bt.w_tok_s.to(torch.float32) ++ else: ++ s_tok = torch.ones(bt.a.shape[0], ++ dtype=torch.float32, ++ device=device) ++ ++ fn = lambda: ops.marlin_qqq_gemm(a=bt.a, ++ b_q_weight=w_q, ++ s_group=w_s, ++ s_tok=s_tok, ++ s_ch=s_ch, ++ workspace=workspace.scratch, ++ size_m=bt.a.shape[0], ++ size_n=bt.w_ref.shape[1], ++ size_k=bt.w_ref.shape[0]) ++ ++ return fn ++ ++ ++def machete_create_bench_fn(bt: BenchmarkTensors, ++ out_type=torch.dtype, ++ schedule=None) -> Callable: ++ w_q = bt.w_q.t().contiguous().t() # make col major ++ w_q = ops.machete_prepack_B(w_q, bt.a.dtype, bt.wtype, ++ None if bt.w_g_s is None else bt.w_g_s.dtype) ++ ++ w_g_zp = bt.w_g_zp ++ if w_g_zp is not None: ++ w_g_zp = -1 * bt.w_g_s * (w_g_zp.to(bt.w_g_s.dtype)) ++ ++ return lambda: ops.machete_mm( ++ a=bt.a, ++ b_q=bt.w_q, ++ b_type=bt.wtype, ++ b_group_scales=bt.w_g_s, ++ b_group_zeros=w_g_zp, ++ b_group_size=bt.group_size, ++ b_channel_scales=bt.w_ch_s, ++ a_token_scales=bt.w_tok_s, ++ out_type=out_type, ++ schedule=schedule, ++ ) ++ ++ ++# impl ++ ++# bench ++ ++ ++def bench_fns(label: str, sub_label: str, description: str, ++ fns: List[Callable]): ++ ++ min_run_time = 1 if not NVTX_PROFILE else 0.1 ++ res = TBenchmark.Timer( ++ stmt=""" ++ for fn in fns: ++ fn() ++ """, ++ globals={ ++ "fns": fns ++ }, ++ label=label, ++ sub_label=sub_label, ++ description=description, ++ ).blocked_autorange(min_run_time=min_run_time) ++ ++ if NVTX_PROFILE: ++ with nvtx.annotate("mm-bench"), nvtx.annotate( ++ f"{label}|{sub_label}|{description}"): ++ fns[0]() ++ ++ return res ++ ++ ++_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None ++_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None ++ ++ ++def bench(types: TypeConfig, ++ group_size: int, ++ m: int, ++ k: int, ++ n: int, ++ label: str, ++ sub_label: str, ++ sweep_schedules: bool = True) -> List[TMeasurement]: ++ benchmark_tensors = create_bench_tensors((m, n, k), types, group_size) ++ sub_label += f", L={len(benchmark_tensors)}" ++ ++ name_type_string = f"W{types.weight_type}"+\ ++ f"-A{terse_type_name(types.act_type)}" ++ if types.group_scale_type is not None: ++ name_type_string += f"-GS{terse_type_name(types.group_scale_type)}" ++ if types.group_zero_type is not None: ++ name_type_string += f"-GZ{terse_type_name(types.group_zero_type)}" ++ if group_size is not None: ++ name_type_string += f"-G{group_size}" ++ if types.channel_scale_type is not None: ++ name_type_string += f"-CS{terse_type_name(types.channel_scale_type)}" ++ if types.token_scale_type is not None: ++ name_type_string += f"-TS{terse_type_name(types.token_scale_type)}" ++ ++ timers = [] ++ # pytorch impl ++ timers.append( ++ bench_fns( ++ label, sub_label, "torch.matmul (fp16)", ++ [torch_matmul_f16_create_bench_fn(bt) ++ for bt in benchmark_tensors])) ++ ++ if types.act_type == torch.int8 or types.act_type == torch.float8_e4m3fn: ++ timers.append( ++ bench_fns( ++ label, sub_label, ++ f"cutlass_scaled_mm ({terse_type_name(types.act_type)})", [ ++ cutlass_scaled_mm_create_bench_fn(bt) ++ for bt in benchmark_tensors ++ ])) ++ ++ if types.act_type != torch.float8_e4m3fn: ++ timers.append( ++ bench_fns(label, sub_label, f"marlin ({name_type_string})", ++ [marlin_create_bench_fn(bt) ++ for bt in benchmark_tensors])) ++ ++ # machete ++ 
timers.append( ++ bench_fns(label, sub_label, f"machete ({name_type_string})", [ ++ machete_create_bench_fn(bt, out_type=types.output_type) ++ for bt in benchmark_tensors ++ ])) ++ ++ if sweep_schedules: ++ global _SWEEP_SCHEDULES_RESULTS ++ ++ print("Finding best schedule for machete") ++ best = None ++ best_schedule = None ++ schedules = ops.machete_supported_schedules( ++ a_type=types.act_type, ++ b_type=types.weight_type, ++ group_scales_type=types.group_scale_type, ++ group_zeros_type=types.group_zero_type, ++ token_scales_type=types.token_scale_type, ++ channel_scales_type=types.channel_scale_type, ++ out_type=types.output_type) ++ ++ if schedules is None or len(schedules) == 0: ++ raise ValueError("No schedules found to sweep") ++ ++ for schedule in reversed(schedules): ++ schedule_M = int(schedule.split("_")[0].split("x")[1]) ++ ++ # Prune known bad schedules ++ if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4: ++ continue ++ ++ res = bench_fns(label, sub_label, "machete_best", [ ++ machete_create_bench_fn( ++ bt, out_type=types.output_type, schedule=schedule) ++ for bt in benchmark_tensors ++ ]) ++ ++ results_row = { ++ "M": m, ++ "K": k, ++ "N": n, ++ "group_size": group_size, ++ "schedule": schedule, ++ "median": res.median, ++ } ++ if _SWEEP_SCHEDULES_RESULTS is None: ++ _SWEEP_SCHEDULES_RESULTS = pd.DataFrame( ++ columns=results_row.keys()) ++ _SWEEP_SCHEDULES_RESULTS.\ ++ loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row ++ ++ print(f" {res.median:5.5} ", schedule) ++ if not best or res.median < best.median: ++ best = res ++ best_schedule = schedule ++ print("Best schedule:", best_schedule) ++ timers.append(best) ++ ++ return timers ++ ++ ++# runner ++def print_timers(timers: List[TMeasurement]): ++ compare = TBenchmark.Compare(timers) ++ compare.print() ++ ++ ++def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: ++ types = TypeConfig( ++ act_type=args.act_type, ++ weight_type=scalar_types.uint4b8 if args.group_zero_type is None \ ++ else scalar_types.uint4, ++ output_type=args.out_type, ++ group_scale_type=args.group_scale_type, ++ group_zero_type=args.group_zero_type, ++ channel_scale_type=args.channel_scale_type, ++ token_scale_type=args.token_scale_type, ++ ) ++ ++ results: List[TMeasurement] = [] ++ for m, k, n in MKNs: ++ timers = bench(types, ++ args.group_size, ++ m, ++ k, ++ n, ++ f"{args.act_type}-gemm", ++ f"MKN=({m}x{k}x{n})", ++ sweep_schedules=args.sweep_schedules) ++ print_timers(timers) ++ results.extend(timers) ++ ++ return results ++ ++ ++# output makers ++def make_output( ++ data: List[TMeasurement], ++ MKNs: Iterable[Tuple[int, int, int]], ++ base_description: str, ++ timestamp=None, ++): ++ ++ print(f"== All Results {base_description} ====") ++ print_timers(data) ++ ++ # pickle all the results ++ timestamp = int(time.time()) if timestamp is None else timestamp ++ with open(f"{base_description}-{timestamp}.pkl", "wb") as f: ++ pkl.dump(data, f) ++ ++ ++# argparse runners ++ ++ ++def run_square_bench(args): ++ dim_sizes = list( ++ range(args.dim_start, args.dim_end + 1, args.dim_increment)) ++ MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) ++ data = run(args.dtype, args.sweep_schedules, MKNs) ++ ++ make_output(data, MKNs, f"square_bench-{args.dtype}") ++ ++ ++def run_range_bench(args): ++ m_start, k_start, n_start = (int(x) for x in args.dim_start.split(",")) ++ m_end, k_end, n_end = (int(x) for x in args.dim_end.split(",")) ++ m_increment, k_increment, n_increment = \ ++ (int(x) for x in args.dim_increment.split(",")) ++ 
Ms = list(range(m_start, m_end + 1, m_increment)) ++ Ks = list(range(k_start, k_end + 1, k_increment)) ++ Ns = list(range(n_start, n_end + 1, n_increment)) ++ MKNs = list(product(Ms, Ks, Ns)) ++ ++ data = run(args.dtype, args.sweep_schedules, MKNs) ++ ++ make_output(data, MKNs, f"range_bench-{args.dtype}") ++ ++ ++def run_model_bench(args): ++ ++ print("Benchmarking models:") ++ for i, model in enumerate(args.models): ++ print(f"[{i}] {model}") ++ ++ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: ++ KNs = [] ++ for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): ++ KN[tp_split_dim] = KN[tp_split_dim] // tp_size ++ KNs.append(KN) ++ return KNs ++ ++ model_bench_data = [] ++ models_tps = list(itertools.product(args.models, args.tp_sizes)) ++ for model, tp_size in models_tps: ++ Ms = args.batch_sizes ++ KNs = model_shapes(model, tp_size) ++ MKNs = [] ++ for m in Ms: ++ for k, n in KNs: ++ MKNs.append((m, k, n)) ++ ++ data = run(args, MKNs) ++ model_bench_data.append(data) ++ ++ type_string = f"{args.act_type}" ++ ++ # Print all results ++ for data, model_tp in zip(model_bench_data, models_tps): ++ model, tp_size = model_tp ++ print(f"== Results {type_string} {model}-TP{tp_size} ====") ++ print_timers(data) ++ ++ timestr = time.strftime("%Y%m%d-%H%M%S") ++ ++ all_results = [] ++ for d in model_bench_data: ++ all_results.extend(d) ++ ++ # pickle all data ++ with open(f"model_bench-{type_string}-{timestr}.pkl", "wb") as f: ++ args_dict = vars(args) ++ args_dict.pop("func") ++ pkl.dump({ ++ "args": args_dict, ++ "results": all_results, ++ }, f) ++ ++ ++if __name__ == "__main__": ++ ++ def to_torch_dtype(dt): ++ return { ++ "bfloat16": torch.bfloat16, ++ "float16": torch.float16, ++ "int8": torch.int8, ++ "float8_e4m3fn": torch.float8_e4m3fn, ++ "int": torch.int, ++ "float": torch.float, ++ }[dt] ++ ++ class ToTorchDtype(argparse.Action): ++ ++ def __call__(self, parser, namespace, values, option_string=None): ++ setattr(namespace, self.dest, to_torch_dtype(values)) ++ ++ parser = FlexibleArgumentParser( ++ description=""" ++Benchmark Machete GEMM. ++ ++ To run square GEMMs: ++ python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 ++ ++ To run constant N and K and sweep M: ++ python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 ++ ++ To run dimensions from a model: ++ python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 ++ ++ Output: ++ - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. 
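++    - if --sweep-schedules is set, per-schedule timings are also written to
++      the CSV named by --sweep-csv-out (default: sch_sweep_results.csv).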
++ """, # noqa: E501 ++ formatter_class=argparse.RawTextHelpFormatter, ++ ) ++ parser.add_argument( ++ "--act-type", ++ action=ToTorchDtype, ++ required=True, ++ choices=['bfloat16', 'float16', 'int8', 'float8_e4m3fn'], ++ ) ++ parser.add_argument( ++ "--group-scale-type", ++ action=ToTorchDtype, ++ choices=['bfloat16', 'float16'], ++ ) ++ parser.add_argument( ++ "--group-zero-type", ++ type=to_torch_dtype, ++ choices=['bfloat16', 'float16'], ++ ) ++ parser.add_argument( ++ "--channel-scale-type", ++ action=ToTorchDtype, ++ choices=['float'], ++ ) ++ parser.add_argument( ++ "--token-scale-type", ++ action=ToTorchDtype, ++ choices=['float'], ++ ) ++ parser.add_argument( ++ "--out-type", ++ action=ToTorchDtype, ++ choices=['bfloat16', 'float16'], ++ ) ++ parser.add_argument( ++ "--group-size", ++ type=int, ++ help="Available options are ['None', '-1', '128'], default=128", ++ default=128, ++ ) ++ parser.add_argument( ++ "--sweep-schedules", ++ action="store_true", ++ help="Run a sweep over all supported schedules", ++ ) ++ parser.add_argument("--sweep-csv-out", ++ help="CSV to store sweep results", ++ default="sch_sweep_results.csv") ++ subparsers = parser.add_subparsers(dest="cmd", required=True) ++ ++ square_parser = subparsers.add_parser("square_bench") ++ square_parser.add_argument("--dim-start", type=int, required=True) ++ square_parser.add_argument("--dim-end", type=int, required=True) ++ square_parser.add_argument("--dim-increment", type=int, required=True) ++ square_parser.set_defaults(func=run_square_bench) ++ ++ range_parser = subparsers.add_parser("range_bench") ++ range_parser.add_argument( ++ "--dim-start", ++ type=str, ++ required=True, ++ help="Start value for M,K,N as common separated list") ++ range_parser.add_argument( ++ "--dim-end", ++ type=str, ++ required=True, ++ help="End value (inclusive) for M,K,N as common separated list") ++ range_parser.add_argument( ++ "--dim-increment", ++ type=str, ++ required=True, ++ help="Increment value for M,K,N as common separated list") ++ range_parser.set_defaults(func=run_range_bench) ++ ++ model_parser = subparsers.add_parser("model_bench") ++ model_parser.add_argument( ++ "--models", ++ nargs="+", ++ type=str, ++ default=DEFAULT_MODELS, ++ choices=WEIGHT_SHAPES.keys(), ++ ) ++ model_parser.add_argument("--tp-sizes", ++ nargs="+", ++ type=int, ++ default=DEFAULT_TP_SIZES) ++ model_parser.add_argument("--batch-sizes", ++ nargs="+", ++ type=int, ++ default=DEFAULT_BATCH_SIZES) ++ model_parser.set_defaults(func=run_model_bench) ++ ++ args = parser.parse_args() ++ ++ _SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out ++ args.func(args) ++ ++ if _SWEEP_SCHEDULES_RESULTS is not None: ++ _SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV) +diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py +new file mode 100644 +index 0000000..8fb44e3 +--- /dev/null ++++ b/benchmarks/kernels/benchmark_marlin.py +@@ -0,0 +1,254 @@ ++from typing import List ++ ++import torch ++import torch.utils.benchmark as benchmark ++from benchmark_shapes import WEIGHT_SHAPES ++ ++from vllm import _custom_ops as ops ++from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( ++ GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N, ++ GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES) ++from vllm.model_executor.layers.quantization.utils.marlin_utils import ( ++ GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, ++ MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types) 
++from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( ++ MarlinWorkspace, marlin_quantize) ++from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( ++ marlin_24_quantize) ++from vllm.model_executor.layers.quantization.utils.quant_utils import ( ++ gptq_pack, gptq_quantize_weights, sort_weights) ++from vllm.scalar_type import ScalarType ++from vllm.utils import FlexibleArgumentParser ++ ++DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] ++DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] ++ ++ACT_ORDER_OPTS = [False, True] ++K_FULL_OPTS = [False, True] ++ ++ ++def bench_run(results: List[benchmark.Measurement], model: str, ++ act_order: bool, is_k_full: bool, quant_type: ScalarType, ++ group_size: int, size_m: int, size_k: int, size_n: int): ++ label = "Quant Matmul" ++ ++ sub_label = ("{}, act={} k_full={}, q={}, g={}, " ++ "MKN=({}x{}x{})".format(model, act_order, is_k_full, ++ str(quant_type), group_size, size_m, ++ size_k, size_n)) ++ ++ print(f"Testing: {sub_label}") ++ ++ a = torch.randn(size_m, size_k).to(torch.half).cuda() ++ b = torch.rand(size_k, size_n).to(torch.half).cuda() ++ ++ a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda()) ++ ++ # Marlin quant ++ ( ++ marlin_w_ref, ++ marlin_q_w, ++ marlin_s, ++ marlin_g_idx, ++ marlin_sort_indices, ++ marlin_rand_perm, ++ ) = marlin_quantize(b, quant_type, group_size, act_order) ++ ++ # Marlin_24 quant ++ (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, ++ marlin_24_s) = marlin_24_quantize(b, quant_type, group_size) ++ ++ marlin_zp = torch.empty(0, dtype=torch.int, device=b.device) ++ ++ # GPTQ quant ++ (w_ref, q_w, s, g_idx, ++ rand_perm) = gptq_quantize_weights(b, quant_type, group_size, act_order) ++ q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n) ++ ++ # For act_order, sort the "weights" and "g_idx" ++ # so that group ids are increasing ++ repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device) ++ if act_order: ++ (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx) ++ ++ # Prepare ++ marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, ++ GPTQ_MARLIN_MAX_PARALLEL) ++ ++ marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N, ++ GPTQ_MARLIN_24_MAX_PARALLEL) ++ marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int) ++ ++ globals = { ++ # Gen params ++ "quant_type": quant_type, ++ "group_size": group_size, ++ "size_m": size_m, ++ "size_n": size_n, ++ "size_k": size_k, ++ "a": a, ++ "a_tmp": a_tmp, ++ # Marlin params ++ "marlin_w_ref": marlin_w_ref, ++ "marlin_q_w": marlin_q_w, ++ "marlin_s": marlin_s, ++ "marlin_zp": marlin_zp, ++ "marlin_g_idx": marlin_g_idx, ++ "marlin_sort_indices": marlin_sort_indices, ++ "marlin_rand_perm": marlin_rand_perm, ++ "marlin_workspace": marlin_workspace, ++ "is_k_full": is_k_full, ++ # Marlin_24 params ++ "marlin_24_w_ref": marlin_24_w_ref, ++ "marlin_24_q_w_comp": marlin_24_q_w_comp, ++ "marlin_24_meta": marlin_24_meta, ++ "marlin_24_s": marlin_24_s, ++ "marlin_24_workspace": marlin_24_workspace, ++ # GPTQ params ++ "q_w_gptq": q_w_gptq, ++ "repack_sort_indices": repack_sort_indices, ++ # Kernels ++ "gptq_marlin_gemm": ops.gptq_marlin_gemm, ++ "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm, ++ "gptq_marlin_repack": ops.gptq_marlin_repack, ++ } ++ ++ min_run_time = 1 ++ ++ # Warmup pytorch ++ for i in range(5): ++ torch.matmul(a, marlin_w_ref) ++ ++ results.append( ++ benchmark.Timer( ++ stmt="torch.matmul(a, marlin_w_ref)", ++ globals=globals, ++ label=label, 
++ sub_label=sub_label, ++ description="pytorch_gemm", ++ ).blocked_autorange(min_run_time=min_run_time)) ++ ++ results.append( ++ benchmark.Timer( ++ stmt= ++ "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 ++ globals=globals, ++ label=label, ++ sub_label=sub_label, ++ description="gptq_marlin_gemm_fp16", ++ ).blocked_autorange(min_run_time=min_run_time)) ++ ++ results.append( ++ benchmark.Timer( ++ stmt= ++ "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 ++ globals=globals, ++ label=label, ++ sub_label=sub_label, ++ description="gptq_marlin_gemm_fp32", ++ ).blocked_autorange(min_run_time=min_run_time)) ++ ++ if (quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES ++ and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES): ++ results.append( ++ benchmark.Timer( ++ stmt= ++ "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501 ++ globals=globals, ++ label=label, ++ sub_label=sub_label, ++ description="gptq_marlin_24_gemm", ++ ).blocked_autorange(min_run_time=min_run_time)) ++ ++ results.append( ++ benchmark.Timer( ++ stmt= ++ "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501 ++ globals=globals, ++ label=label, ++ sub_label=sub_label, ++ description="gptq_marlin_repack", ++ ).blocked_autorange(min_run_time=min_run_time)) ++ ++ ++def main(args): ++ print("Benchmarking models:") ++ for i, model in enumerate(args.models): ++ print(f"[{i}] {model}") ++ ++ results: List[benchmark.Measurement] = [] ++ ++ for model in args.models: ++ for layer in WEIGHT_SHAPES[model]: ++ size_k = layer[0] ++ size_n = layer[1] ++ ++ if len(args.limit_k) > 0 and size_k not in args.limit_k: ++ continue ++ ++ if len(args.limit_n) > 0 and size_n not in args.limit_n: ++ continue ++ ++ for act_order in ACT_ORDER_OPTS: ++ if len(args.limit_act_order ++ ) > 0 and act_order not in args.limit_act_order: ++ continue ++ ++ for is_k_full in K_FULL_OPTS: ++ if len(args.limit_k_full ++ ) > 0 and is_k_full not in args.limit_k_full: ++ continue ++ ++ for quant_type in query_marlin_supported_quant_types( ++ False): ++ if len(args.limit_num_bits) > 0 and \ ++ quant_type.size_bits not in args.limit_num_bits: ++ continue ++ ++ for group_size in MARLIN_SUPPORTED_GROUP_SIZES: ++ if len( ++ args.limit_group_size ++ ) > 0 and group_size not in args.limit_group_size: ++ continue ++ ++ # For act_order, the group_size must be less than ++ # size_k ++ if act_order and (group_size == size_k ++ or group_size == -1): ++ continue ++ ++ for size_m in args.batch_sizes: ++ bench_run(results, model, act_order, is_k_full, ++ quant_type, group_size, size_m, ++ size_k, size_n) ++ ++ compare = benchmark.Compare(results) ++ compare.print() ++ ++ ++# For quick benchmarking use: ++# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501 ++# ++if __name__ == "__main__": ++ parser = FlexibleArgumentParser( ++ description="Benchmark Marlin across specified models/shapes/batches") ++ parser.add_argument( ++ "--models", ++ nargs="+", ++ type=str, ++ default=DEFAULT_MODELS, 
++ choices=WEIGHT_SHAPES.keys(), ++ ) ++ parser.add_argument("--batch-sizes", ++ nargs="+", ++ type=int, ++ default=DEFAULT_BATCH_SIZES) ++ parser.add_argument("--limit-k", nargs="+", type=int, default=[]) ++ parser.add_argument("--limit-n", nargs="+", type=int, default=[]) ++ parser.add_argument("--limit-group-size", nargs="+", type=int, default=[]) ++ parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[]) ++ parser.add_argument("--limit-act-order", nargs="+", type=int, default=[]) ++ parser.add_argument("--limit-k-full", nargs="+", type=int, default=[]) ++ ++ args = parser.parse_args() ++ main(args) +diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py +new file mode 100644 +index 0000000..8f538c2 +--- /dev/null ++++ b/benchmarks/kernels/benchmark_moe.py +@@ -0,0 +1,367 @@ ++import argparse ++import time ++from datetime import datetime ++from typing import Any, Dict, List, Tuple, TypedDict ++ ++import ray ++import torch ++import triton ++from ray.experimental.tqdm_ray import tqdm ++from transformers import AutoConfig ++ ++from vllm.model_executor.layers.fused_moe.fused_moe import * ++from vllm.platforms import current_platform ++from vllm.utils import FlexibleArgumentParser ++ ++ ++class BenchmarkConfig(TypedDict): ++ BLOCK_SIZE_M: int ++ BLOCK_SIZE_N: int ++ BLOCK_SIZE_K: int ++ GROUP_SIZE_M: int ++ num_warps: int ++ num_stages: int ++ ++ ++def benchmark_config( ++ config: BenchmarkConfig, ++ num_tokens: int, ++ num_experts: int, ++ shard_intermediate_size: int, ++ hidden_size: int, ++ topk: int, ++ dtype: torch.dtype, ++ use_fp8_w8a8: bool, ++ use_int8_w8a16: bool, ++ num_iters: int = 100, ++) -> float: ++ init_dtype = torch.float16 if use_fp8_w8a8 else dtype ++ x = torch.randn(num_tokens, hidden_size, dtype=dtype) ++ if use_int8_w8a16: ++ w1 = torch.randint(-127, ++ 127, ( ++ num_experts, ++ shard_intermediate_size, ++ hidden_size, ++ ), ++ dtype=torch.int8) ++ w2 = torch.randint(-127, ++ 127, ( ++ num_experts, ++ hidden_size, ++ shard_intermediate_size // 2, ++ ), ++ dtype=torch.int8) ++ else: ++ w1 = torch.randn(num_experts, ++ shard_intermediate_size, ++ hidden_size, ++ dtype=init_dtype) ++ w2 = torch.randn(num_experts, ++ hidden_size, ++ shard_intermediate_size // 2, ++ dtype=init_dtype) ++ gating_output = torch.randn(num_iters, ++ num_tokens, ++ num_experts, ++ dtype=torch.float32) ++ ++ w1_scale = None ++ w2_scale = None ++ a1_scale = None ++ a2_scale = None ++ if use_int8_w8a16: ++ w1_scale = torch.randn((num_experts, 2 * shard_intermediate_size), ++ dtype=torch.float32) ++ w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32) ++ if use_fp8_w8a8: ++ w1_scale = torch.randn(num_experts, dtype=torch.float32) ++ w2_scale = torch.randn(num_experts, dtype=torch.float32) ++ a1_scale = torch.randn(1, dtype=torch.float32) ++ a2_scale = torch.randn(1, dtype=torch.float32) ++ ++ w1 = w1.to(torch.float8_e4m3fn) ++ w2 = w2.to(torch.float8_e4m3fn) ++ ++ input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32) ++ ++ def prepare(i: int): ++ input_gating.copy_(gating_output[i]) ++ ++ def run(): ++ from vllm.model_executor.layers.fused_moe import override_config ++ with override_config(config): ++ fused_moe( ++ x, ++ w1, ++ w2, ++ input_gating, ++ topk, ++ renormalize=True, ++ inplace=True, ++ use_fp8_w8a8=use_fp8_w8a8, ++ use_int8_w8a16=use_int8_w8a16, ++ w1_scale=w1_scale, ++ w2_scale=w2_scale, ++ a1_scale=a1_scale, ++ a2_scale=a2_scale, ++ ) ++ ++ # JIT compilation & warmup ++ run() ++ 
torch.cuda.synchronize() ++ ++ # Capture 10 invocations with CUDA graph ++ graph = torch.cuda.CUDAGraph() ++ with torch.cuda.graph(graph): ++ for _ in range(10): ++ run() ++ torch.cuda.synchronize() ++ ++ # Warmup ++ for _ in range(5): ++ graph.replay() ++ torch.cuda.synchronize() ++ ++ start_event = torch.cuda.Event(enable_timing=True) ++ end_event = torch.cuda.Event(enable_timing=True) ++ ++ latencies: List[float] = [] ++ for i in range(num_iters): ++ prepare(i) ++ torch.cuda.synchronize() ++ ++ start_event.record() ++ graph.replay() ++ end_event.record() ++ end_event.synchronize() ++ latencies.append(start_event.elapsed_time(end_event)) ++ avg = sum(latencies) / (num_iters * 10) * 1000 # us ++ graph.reset() ++ return avg ++ ++ ++def get_configs_compute_bound() -> List[Dict[str, int]]: ++ # Reduced search space for faster tuning. ++ # TODO(woosuk): Increase the search space and use a performance model to ++ # prune the search space. ++ configs: List[BenchmarkConfig] = [] ++ for num_stages in [2, 3, 4, 5]: ++ for block_m in [16, 32, 64, 128, 256]: ++ for block_k in [64, 128, 256]: ++ for block_n in [32, 64, 128, 256]: ++ for num_warps in [4, 8]: ++ for group_size in [1, 16, 32, 64]: ++ configs.append({ ++ "BLOCK_SIZE_M": block_m, ++ "BLOCK_SIZE_N": block_n, ++ "BLOCK_SIZE_K": block_k, ++ "GROUP_SIZE_M": group_size, ++ "num_warps": num_warps, ++ "num_stages": num_stages, ++ }) ++ return configs ++ ++ ++@ray.remote(num_gpus=1) ++class BenchmarkWorker: ++ ++ def __init__(self, seed: int) -> None: ++ torch.set_default_device("cuda") ++ current_platform.seed_everything(seed) ++ self.seed = seed ++ ++ def benchmark( ++ self, ++ num_tokens: int, ++ num_experts: int, ++ shard_intermediate_size: int, ++ hidden_size: int, ++ topk: int, ++ dtype: torch.dtype, ++ use_fp8_w8a8: bool, ++ use_int8_w8a16: bool, ++ ) -> Tuple[Dict[str, int], float]: ++ current_platform.seed_everything(self.seed) ++ dtype_str = get_config_dtype_str(dtype, ++ use_int8_w8a16=use_int8_w8a16, ++ use_fp8_w8a8=use_fp8_w8a8) ++ # NOTE(woosuk): The current naming convention uses w2.shape[2], which ++ # is the intermediate size after silu_and_mul. ++ op_config = get_moe_configs(num_experts, shard_intermediate_size // 2, ++ dtype_str) ++ if op_config is None: ++ config = get_default_config(num_tokens, num_experts, ++ shard_intermediate_size, hidden_size, ++ topk, dtype_str) ++ else: ++ config = op_config[min(op_config.keys(), ++ key=lambda x: abs(x - num_tokens))] ++ kernel_time = benchmark_config(config, num_tokens, num_experts, ++ shard_intermediate_size, hidden_size, ++ topk, dtype, use_fp8_w8a8, ++ use_int8_w8a16) ++ return config, kernel_time ++ ++ def tune( ++ self, ++ num_tokens: int, ++ num_experts: int, ++ shard_intermediate_size: int, ++ hidden_size: int, ++ topk: int, ++ dtype: torch.dtype, ++ use_fp8_w8a8: bool, ++ use_int8_w8a16: bool, ++ search_space: List[Dict[str, int]], ++ ) -> Dict[str, int]: ++ best_config = None ++ best_time = float("inf") ++ for config in tqdm(search_space): ++ try: ++ kernel_time = benchmark_config(config, ++ num_tokens, ++ num_experts, ++ shard_intermediate_size, ++ hidden_size, ++ topk, ++ dtype, ++ use_fp8_w8a8, ++ use_int8_w8a16, ++ num_iters=10) ++ except triton.runtime.autotuner.OutOfResources: ++ # Some configurations may be invalid and fail to compile. 
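++                # (e.g. a tile shape that needs more shared memory than the
++                # device provides)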
++ continue ++ ++ if kernel_time < best_time: ++ best_time = kernel_time ++ best_config = config ++ now = datetime.now() ++ print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") ++ assert best_config is not None ++ return best_config ++ ++ ++def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: ++ return { ++ "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], ++ "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], ++ "BLOCK_SIZE_K": config["BLOCK_SIZE_K"], ++ "GROUP_SIZE_M": config["GROUP_SIZE_M"], ++ "num_warps": config["num_warps"], ++ "num_stages": config["num_stages"], ++ } ++ ++ ++def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, ++ shard_intermediate_size: int, hidden_size: int, topk: int, ++ dtype: torch.dtype, use_fp8_w8a8: bool, ++ use_int8_w8a16: bool) -> None: ++ dtype_str = get_config_dtype_str(dtype, ++ use_int8_w8a16=use_int8_w8a16, ++ use_fp8_w8a8=use_fp8_w8a8) ++ ++ # NOTE(woosuk): The current naming convention uses w2.shape[2], which ++ # is the intermediate size after silu_and_mul. ++ filename = get_config_file_name(num_experts, shard_intermediate_size // 2, ++ dtype_str) ++ ++ print(f"Writing best config to {filename}...") ++ with open(filename, "w") as f: ++ json.dump(configs, f, indent=4) ++ f.write("\n") ++ ++ ++def main(args: argparse.Namespace): ++ print(args) ++ ++ config = AutoConfig.from_pretrained(args.model) ++ if config.architectures[0] == "DbrxForCausalLM": ++ E = config.ffn_config.moe_num_experts ++ topk = config.ffn_config.moe_top_k ++ intermediate_size = config.ffn_config.ffn_hidden_size ++ shard_intermediate_size = 2 * intermediate_size // args.tp_size ++ elif config.architectures[0] == "JambaForCausalLM": ++ E = config.num_experts ++ topk = config.num_experts_per_tok ++ intermediate_size = config.intermediate_size ++ shard_intermediate_size = 2 * intermediate_size // args.tp_size ++ else: ++ # Default: Mixtral. 
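++        # (MixtralForCausalLM and other configs that expose
++        # num_local_experts / num_experts_per_tok)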
++ E = config.num_local_experts ++ topk = config.num_experts_per_tok ++ intermediate_size = config.intermediate_size ++ shard_intermediate_size = 2 * intermediate_size // args.tp_size ++ ++ hidden_size = config.hidden_size ++ dtype = config.torch_dtype ++ use_fp8_w8a8 = args.dtype == "fp8_w8a8" ++ use_int8_w8a16 = args.dtype == "int8_w8a16" ++ ++ if args.batch_size is None: ++ batch_sizes = [ ++ 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, ++ 2048, 3072, 4096 ++ ] ++ else: ++ batch_sizes = [args.batch_size] ++ ++ ray.init() ++ num_gpus = int(ray.available_resources()["GPU"]) ++ workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] ++ ++ def _distribute(method: str, inputs: List[Any]) -> List[Any]: ++ outputs = [] ++ worker_idx = 0 ++ for input_args in inputs: ++ worker = workers[worker_idx] ++ worker_method = getattr(worker, method) ++ output = worker_method.remote(*input_args) ++ outputs.append(output) ++ worker_idx = (worker_idx + 1) % num_gpus ++ return ray.get(outputs) ++ ++ if args.tune: ++ search_space = get_configs_compute_bound() ++ print(f"Start tuning over {len(search_space)} configurations...") ++ ++ start = time.time() ++ configs = _distribute( ++ "tune", [(batch_size, E, shard_intermediate_size, hidden_size, ++ topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space) ++ for batch_size in batch_sizes]) ++ best_configs = { ++ M: sort_config(config) ++ for M, config in zip(batch_sizes, configs) ++ } ++ save_configs(best_configs, E, shard_intermediate_size, hidden_size, ++ topk, dtype, use_fp8_w8a8, use_int8_w8a16) ++ end = time.time() ++ print(f"Tuning took {end - start:.2f} seconds") ++ else: ++ outputs = _distribute( ++ "benchmark", [(batch_size, E, shard_intermediate_size, hidden_size, ++ topk, dtype, use_fp8_w8a8, use_int8_w8a16) ++ for batch_size in batch_sizes]) ++ ++ for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): ++ print(f"Batch size: {batch_size}, config: {config}") ++ print(f"Kernel time: {kernel_time:.2f} us") ++ ++ ++if __name__ == "__main__": ++ parser = FlexibleArgumentParser() ++ parser.add_argument("--model", ++ type=str, ++ default="mistralai/Mixtral-8x7B-Instruct-v0.1") ++ parser.add_argument("--tp-size", "-tp", type=int, default=2) ++ parser.add_argument("--dtype", ++ type=str, ++ choices=["auto", "fp8_w8a8", "int8_w8a16"], ++ default="auto") ++ parser.add_argument("--seed", type=int, default=0) ++ parser.add_argument("--batch-size", type=int, required=False) ++ parser.add_argument("--tune", action="store_true") ++ args = parser.parse_args() ++ ++ main(args) +diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py +index ca7967c..14eef00 100644 +--- a/benchmarks/kernels/benchmark_paged_attention.py ++++ b/benchmarks/kernels/benchmark_paged_attention.py +@@ -1,12 +1,13 @@ +-import argparse + import random + import time +-from typing import Optional ++from typing import List, Optional + + import torch + + from vllm import _custom_ops as ops +-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random ++from vllm.platforms import current_platform ++from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, ++ create_kv_caches_with_random) + + NUM_BLOCKS = 1024 + PARTITION_SIZE = 512 +@@ -28,10 +29,7 @@ def main( + device: str = "cuda", + kv_cache_dtype: Optional[str] = None, + ) -> None: +- random.seed(seed) +- torch.random.manual_seed(seed) +- if torch.cuda.is_available(): +- torch.cuda.manual_seed(seed) ++ 
current_platform.seed_everything(seed) + + scale = float(1.0 / (head_size**0.5)) + query = torch.empty(num_seqs, +@@ -54,14 +52,17 @@ def main( + + # Create the block tables. + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size +- block_tables = [] ++ block_tables_lst: List[List[int]] = [] + for _ in range(num_seqs): + block_table = [ + random.randint(0, NUM_BLOCKS - 1) + for _ in range(max_num_blocks_per_seq) + ] +- block_tables.append(block_table) +- block_tables = torch.tensor(block_tables, dtype=torch.int, device=device) ++ block_tables_lst.append(block_table) ++ ++ block_tables = torch.tensor(block_tables_lst, ++ dtype=torch.int, ++ device=device) + + # Create the KV cache. + key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS, +@@ -97,7 +98,7 @@ def main( + start_time = time.perf_counter() + + # Using default kv_scale +- kv_scale = 1.0 ++ k_scale = v_scale = 1.0 + + for _ in range(num_iters): + if version == "v1": +@@ -114,7 +115,8 @@ def main( + max_seq_len, + alibi_slopes, + kv_cache_dtype, +- kv_scale, ++ k_scale, ++ v_scale, + ) + elif version == "v2": + ops.paged_attention_v2( +@@ -133,7 +135,8 @@ def main( + max_seq_len, + alibi_slopes, + kv_cache_dtype, +- kv_scale, ++ k_scale, ++ v_scale, + ) + else: + raise ValueError(f"Invalid version: {version}") +@@ -158,19 +161,19 @@ def main( + + + if __name__ == '__main__': +- parser = argparse.ArgumentParser( ++ parser = FlexibleArgumentParser( + description="Benchmark the paged attention kernel.") + parser.add_argument("--version", + type=str, + choices=["v1", "v2"], + default="v2") + parser.add_argument("--batch-size", type=int, default=8) +- parser.add_argument("--seq_len", type=int, default=4096) ++ parser.add_argument("--seq-len", type=int, default=4096) + parser.add_argument("--num-query-heads", type=int, default=64) + parser.add_argument("--num-kv-heads", type=int, default=8) + parser.add_argument("--head-size", + type=int, +- choices=[64, 80, 96, 112, 128, 256], ++ choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128) + parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) + parser.add_argument("--use-alibi", action="store_true") +@@ -183,13 +186,11 @@ if __name__ == '__main__': + parser.add_argument( + "--kv-cache-dtype", + type=str, +- choices=["auto", "fp8"], ++ choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"], + default="auto", +- help= +- 'Data type for kv cache storage. If "auto", will use model data type. ' +- 'FP8_E5M2 (without scaling) is only supported on cuda version greater ' +- 'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for ' +- 'common inference criteria.') ++ help="Data type for kv cache storage. If 'auto', will use model " ++ "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. 
" ++ "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)") + args = parser.parse_args() + print(args) + +diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py +new file mode 100644 +index 0000000..1d62483 +--- /dev/null ++++ b/benchmarks/kernels/benchmark_quant.py +@@ -0,0 +1,100 @@ ++import time ++ ++import torch ++ ++from vllm import _custom_ops as ops ++from vllm.platforms import current_platform ++from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser ++ ++ ++@torch.inference_mode() ++def main(num_tokens: int, ++ hidden_size: int, ++ static_scale: bool, ++ quant_dtype: torch.dtype, ++ dtype: torch.dtype, ++ seed: int = 0, ++ do_profile: bool = False, ++ num_warmup_iters: int = 5, ++ num_iters: int = 100) -> None: ++ current_platform.seed_everything(seed) ++ torch.set_default_device("cuda") ++ ++ x = torch.randn(num_tokens, hidden_size, dtype=dtype) ++ scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None ++ ++ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: ++ torch.cuda.synchronize() ++ if profile: ++ torch.cuda.cudart().cudaProfilerStart() ++ start_time = time.perf_counter() ++ ++ for _ in range(num_iters): ++ if quant_dtype == torch.int8: ++ ops.scaled_int8_quant(x, scale) ++ else: ++ ops.scaled_fp8_quant(x, scale) ++ torch.cuda.synchronize() ++ ++ end_time = time.perf_counter() ++ if profile: ++ torch.cuda.cudart().cudaProfilerStart() ++ return (end_time - start_time) / num_iters ++ ++ # Warmup. ++ print("Warming up...") ++ run_benchmark = run_cuda_benchmark ++ run_benchmark(num_iters=num_warmup_iters, profile=False) ++ ++ # Benchmark. ++ if do_profile: ++ latency = run_benchmark(num_iters=1, profile=True) ++ else: ++ latency = run_benchmark(num_iters=num_iters, profile=False) ++ print(f"Kernel running time: {latency * 1000000:.3f} us") ++ ++ ++if __name__ == '__main__': ++ ++ def to_torch_dtype(dt): ++ if dt == "int8": ++ return torch.int8 ++ if dt == "fp8": ++ return torch.float8_e4m3fn ++ raise ValueError(f"Unsupported dtype: {dt}") ++ ++ parser = FlexibleArgumentParser( ++ description="Benchmark the quantization (fp8 or int8) kernel.") ++ parser.add_argument("--num-tokens", type=int, default=4096) ++ parser.add_argument("--hidden-size", type=int, default=8192) ++ parser.add_argument("--static-scale", action="store_true") ++ parser.add_argument("--quant-dtype", ++ type=str, ++ choices=["fp8", "int8"], ++ default="int8") ++ parser.add_argument("--dtype", ++ type=str, ++ choices=["half", "bfloat16", "float"], ++ default="half") ++ ++ parser.add_argument("--seed", type=int, default=0) ++ parser.add_argument("--profile", action="store_true") ++ parser.add_argument("--num-warmup-iters", type=int, default=5) ++ parser.add_argument("--num-iters", ++ type=int, ++ default=100, ++ help="Number of benchmark iterations. 
" ++ "If --profile is set, this number is ignored") ++ ++ args = parser.parse_args() ++ print(args) ++ ++ main(num_tokens=args.num_tokens, ++ hidden_size=args.hidden_size, ++ static_scale=args.static_scale, ++ quant_dtype=to_torch_dtype(args.quant_dtype), ++ dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], ++ seed=args.seed, ++ do_profile=args.profile, ++ num_warmup_iters=args.num_warmup_iters, ++ num_iters=args.num_iters) +diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py +new file mode 100644 +index 0000000..baa5de0 +--- /dev/null ++++ b/benchmarks/kernels/benchmark_rmsnorm.py +@@ -0,0 +1,262 @@ ++import itertools ++from typing import Optional, Tuple, Union ++ ++import torch ++import triton ++from flashinfer.norm import fused_add_rmsnorm, rmsnorm ++from torch import nn ++ ++from vllm import _custom_ops as vllm_ops ++ ++ ++class HuggingFaceRMSNorm(nn.Module): ++ ++ def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: ++ super().__init__() ++ self.weight = nn.Parameter(torch.ones(hidden_size)) ++ self.variance_epsilon = eps ++ ++ def forward( ++ self, ++ x: torch.Tensor, ++ residual: Optional[torch.Tensor] = None, ++ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: ++ orig_dtype = x.dtype ++ x = x.to(torch.float32) ++ if residual is not None: ++ x = x + residual.to(torch.float32) ++ residual = x.to(orig_dtype) ++ ++ variance = x.pow(2).mean(dim=-1, keepdim=True) ++ x = x * torch.rsqrt(variance + self.variance_epsilon) ++ x = x.to(orig_dtype) * self.weight ++ if residual is None: ++ return x ++ else: ++ return x, residual ++ ++ ++def rmsnorm_naive( ++ x: torch.Tensor, ++ weight: torch.Tensor, ++ residual: Optional[torch.Tensor] = None, ++ eps: float = 1e-6, ++): ++ naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) ++ naive_norm.weight = nn.Parameter(weight) ++ naive_norm = naive_norm.to(x.device) ++ ++ orig_shape = x.shape ++ x = x.view(-1, x.shape[-1]) ++ if residual is not None: ++ residual = residual.view(-1, residual.shape[-1]) ++ ++ output = naive_norm(x, residual) ++ ++ if isinstance(output, tuple): ++ output = (output[0].view(orig_shape), output[1].view(orig_shape)) ++ else: ++ output = output.view(orig_shape) ++ return output ++ ++ ++def rmsnorm_flashinfer( ++ x: torch.Tensor, ++ weight: torch.Tensor, ++ residual: Optional[torch.Tensor] = None, ++ eps: float = 1e-6, ++): ++ orig_shape = x.shape ++ x = x.view(-1, x.shape[-1]) ++ if residual is not None: ++ residual = residual.view(-1, residual.shape[-1]) ++ ++ if residual is not None: ++ fused_add_rmsnorm(x, residual, weight, eps) ++ output = (x, residual) ++ else: ++ output = rmsnorm(x, weight, eps) ++ ++ if isinstance(output, tuple): ++ output = (output[0].view(orig_shape), output[1].view(orig_shape)) ++ else: ++ output = output.view(orig_shape) ++ return output ++ ++ ++def rmsnorm_vllm( ++ x: torch.Tensor, ++ weight: torch.Tensor, ++ residual: Optional[torch.Tensor] = None, ++ eps: float = 1e-6, ++): ++ orig_shape = x.shape ++ x = x.view(-1, x.shape[-1]) ++ if residual is not None: ++ residual = residual.view(-1, residual.shape[-1]) ++ ++ if residual is not None: ++ vllm_ops.fused_add_rms_norm(x, residual, weight, eps) ++ output = (x, residual) ++ else: ++ out = torch.empty_like(x) ++ vllm_ops.rms_norm(out, x, weight, eps) ++ output = out ++ ++ if isinstance(output, tuple): ++ output = (output[0].view(orig_shape), output[1].view(orig_shape)) ++ else: ++ output = output.view(orig_shape) ++ return output ++ ++ ++def calculate_diff(batch_size, seq_len, hidden_size, 
use_residual=True): ++ dtype = torch.bfloat16 ++ x = torch.randn(batch_size, ++ seq_len, ++ hidden_size, ++ dtype=dtype, ++ device="cuda") ++ weight = torch.ones(hidden_size, dtype=dtype, device="cuda") ++ residual = torch.randn_like(x) if use_residual else None ++ ++ output_naive = rmsnorm_naive( ++ x.clone(), weight, ++ residual.clone() if residual is not None else None) ++ output_flashinfer = rmsnorm_flashinfer( ++ x.clone(), weight, ++ residual.clone() if residual is not None else None) ++ output_vllm = rmsnorm_vllm( ++ x.clone(), weight, ++ residual.clone() if residual is not None else None) ++ ++ if use_residual: ++ output_naive = output_naive[0] ++ output_flashinfer = output_flashinfer[0] ++ output_vllm = output_vllm[0] ++ ++ print(f"Naive output={output_naive}") ++ print(f"FlashInfer output={output_flashinfer}") ++ print(f"VLLM output={output_vllm}") ++ ++ if torch.allclose(output_naive, output_flashinfer, atol=1e-2, ++ rtol=1e-2) and torch.allclose( ++ output_naive, output_vllm, atol=1e-2, rtol=1e-2): ++ print("✅ All implementations match") ++ else: ++ print("❌ Implementations differ") ++ ++ ++batch_size_range = [2**i for i in range(0, 7, 2)] ++seq_length_range = [2**i for i in range(6, 11, 1)] ++head_num_range = [32, 48] ++configs = list( ++ itertools.product(head_num_range, batch_size_range, seq_length_range)) ++ ++ ++def get_benchmark(use_residual): ++ ++ @triton.testing.perf_report( ++ triton.testing.Benchmark( ++ x_names=["head_num", "batch_size", "seq_len"], ++ x_vals=[list(_) for _ in configs], ++ line_arg="provider", ++ line_vals=["huggingface", "flashinfer", "vllm"], ++ line_names=["HuggingFace", "FlashInfer", "vLLM"], ++ styles=[("blue", "-"), ("green", "-"), ("red", "-")], ++ ylabel="us", ++ plot_name= ++ f"rmsnorm-perf-{'with' if use_residual else 'without'}-residual", ++ args={}, ++ )) ++ def benchmark(head_num, batch_size, seq_len, provider): ++ dtype = torch.bfloat16 ++ hidden_size = head_num * 128 # assuming head_dim = 128 ++ ++ x = torch.randn(batch_size, ++ seq_len, ++ hidden_size, ++ dtype=dtype, ++ device="cuda") ++ weight = torch.ones(hidden_size, dtype=dtype, device="cuda") ++ residual = torch.randn_like(x) if use_residual else None ++ ++ quantiles = [0.5, 0.2, 0.8] ++ ++ if provider == "huggingface": ++ ms, min_ms, max_ms = triton.testing.do_bench( ++ lambda: rmsnorm_naive( ++ x.clone(), ++ weight, ++ residual.clone() if residual is not None else None, ++ ), ++ quantiles=quantiles, ++ ) ++ elif provider == "flashinfer": ++ ms, min_ms, max_ms = triton.testing.do_bench( ++ lambda: rmsnorm_flashinfer( ++ x.clone(), ++ weight, ++ residual.clone() if residual is not None else None, ++ ), ++ quantiles=quantiles, ++ ) ++ else: ++ ms, min_ms, max_ms = triton.testing.do_bench( ++ lambda: rmsnorm_vllm( ++ x.clone(), ++ weight, ++ residual.clone() if residual is not None else None, ++ ), ++ quantiles=quantiles, ++ ) ++ ++ return 1000 * ms, 1000 * max_ms, 1000 * min_ms ++ ++ return benchmark ++ ++ ++if __name__ == "__main__": ++ import argparse ++ ++ parser = argparse.ArgumentParser() ++ parser.add_argument( ++ "--batch-size", ++ type=int, ++ default=4, ++ help="Batch size", ++ ) ++ parser.add_argument( ++ "--seq-len", ++ type=int, ++ default=128, ++ help="Sequence length", ++ ) ++ parser.add_argument( ++ "--hidden-size", ++ type=int, ++ default=4096, ++ help="Hidden size (2nd dimension) of the sequence", ++ ) ++ parser.add_argument("--use-residual", ++ action="store_true", ++ help="Whether to use residual connection") ++ parser.add_argument( ++ "--save-path", ++ 
type=str, ++ default="./configs/rmsnorm/", ++ help="Path to save rmsnorm benchmark results", ++ ) ++ ++ args = parser.parse_args() ++ ++ # Run correctness test ++ calculate_diff(batch_size=args.batch_size, ++ seq_len=args.seq_len, ++ hidden_size=args.hidden_size, ++ use_residual=args.use_residual) ++ ++ # Get the benchmark function with proper use_residual setting ++ benchmark = get_benchmark(args.use_residual) ++ # Run performance benchmark ++ benchmark.run(print_data=True, save_path=args.save_path) +diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py +index 9188e81..250d505 100644 +--- a/benchmarks/kernels/benchmark_rope.py ++++ b/benchmarks/kernels/benchmark_rope.py +@@ -1,11 +1,13 @@ +-import argparse + from itertools import accumulate +-from typing import Optional ++from typing import List, Optional + + import nvtx + import torch + +-from vllm.model_executor.layers.rotary_embedding import get_rope ++from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, ++ get_rope) ++from vllm.platforms import current_platform ++from vllm.utils import FlexibleArgumentParser + + + def benchmark_rope_kernels_multi_lora( +@@ -21,9 +23,7 @@ def benchmark_rope_kernels_multi_lora( + max_position: int = 8192, + base: int = 10000, + ) -> None: +- torch.random.manual_seed(seed) +- if torch.cuda.is_available(): +- torch.cuda.manual_seed(seed) ++ current_platform.seed_everything(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size +@@ -32,17 +32,17 @@ def benchmark_rope_kernels_multi_lora( + # batched RoPE can take multiple scaling factors + batched_rope = get_rope(head_size, rotary_dim, max_position, base, + is_neox_style, { +- "type": "linear", ++ "rope_type": "linear", + "factor": tuple(scaling_factors) + }) + # non-batched RoPE takes only one scaling factor, we create multiple + # instances to simulate the same behavior +- non_batched_ropes = [] ++ non_batched_ropes: List[RotaryEmbedding] = [] + for scaling_factor in scaling_factors: + non_batched_ropes.append( + get_rope(head_size, rotary_dim, max_position, base, is_neox_style, + { +- "type": "linear", ++ "rope_type": "linear", + "factor": (scaling_factor, ) + })) + +@@ -85,7 +85,7 @@ def benchmark_rope_kernels_multi_lora( + + + if __name__ == '__main__': +- parser = argparse.ArgumentParser( ++ parser = FlexibleArgumentParser( + description="Benchmark the rotary embedding kernels.") + parser.add_argument("--is-neox-style", type=bool, default=True) + parser.add_argument("--batch-size", type=int, default=16) +@@ -93,7 +93,7 @@ if __name__ == '__main__': + parser.add_argument("--num-heads", type=int, default=8) + parser.add_argument("--head-size", + type=int, +- choices=[64, 80, 96, 112, 128, 256], ++ choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128) + parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) + parser.add_argument("--dtype", +diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py +new file mode 100644 +index 0000000..4eeeca3 +--- /dev/null ++++ b/benchmarks/kernels/benchmark_shapes.py +@@ -0,0 +1,75 @@ ++WEIGHT_SHAPES = { ++ "ideal": [[4 * 256 * 32, 256 * 32]], ++ "mistralai/Mistral-7B-v0.1/TP1": [ ++ [4096, 6144], ++ [4096, 4096], ++ [4096, 28672], ++ [14336, 4096], ++ ], ++ "mistralai/Mistral-7B-v0.1/TP2": [ ++ [4096, 3072], ++ [2048, 4096], ++ [4096, 14336], ++ [7168, 4096], ++ ], ++ "mistralai/Mistral-7B-v0.1/TP4": [ ++ [4096, 1536], ++ [1024, 4096], ++ [4096, 7168], ++ [3584, 
4096], ++ ], ++ "meta-llama/Llama-2-7b-hf/TP1": [ ++ [4096, 12288], ++ [4096, 4096], ++ [4096, 22016], ++ [11008, 4096], ++ ], ++ "meta-llama/Llama-2-7b-hf/TP2": [ ++ [4096, 6144], ++ [2048, 4096], ++ [4096, 11008], ++ [5504, 4096], ++ ], ++ "meta-llama/Llama-2-7b-hf/TP4": [ ++ [4096, 3072], ++ [1024, 4096], ++ [4096, 5504], ++ [2752, 4096], ++ ], ++ "meta-llama/Llama-2-13b-hf/TP1": [ ++ [5120, 15360], ++ [5120, 5120], ++ [5120, 27648], ++ [13824, 5120], ++ ], ++ "meta-llama/Llama-2-13b-hf/TP2": [ ++ [5120, 7680], ++ [2560, 5120], ++ [5120, 13824], ++ [6912, 5120], ++ ], ++ "meta-llama/Llama-2-13b-hf/TP4": [ ++ [5120, 3840], ++ [1280, 5120], ++ [5120, 6912], ++ [3456, 5120], ++ ], ++ "meta-llama/Llama-2-70b-hf/TP1": [ ++ [8192, 10240], ++ [8192, 8192], ++ [8192, 57344], ++ [28672, 8192], ++ ], ++ "meta-llama/Llama-2-70b-hf/TP2": [ ++ [8192, 5120], ++ [4096, 8192], ++ [8192, 28672], ++ [14336, 8192], ++ ], ++ "meta-llama/Llama-2-70b-hf/TP4": [ ++ [8192, 2560], ++ [2048, 8192], ++ [8192, 14336], ++ [7168, 8192], ++ ], ++} +diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py +new file mode 100644 +index 0000000..7d0bd84 +--- /dev/null ++++ b/benchmarks/kernels/graph_machete_bench.py +@@ -0,0 +1,63 @@ ++import math ++import pickle ++import re ++from collections import defaultdict ++from typing import List ++ ++import matplotlib.pyplot as plt ++import pandas as pd ++import seaborn as sns ++from torch.utils.benchmark import Measurement as TMeasurement ++ ++from vllm.utils import FlexibleArgumentParser ++ ++if __name__ == "__main__": ++ parser = FlexibleArgumentParser( ++ description='Benchmark the latency of processing a single batch of ' ++ 'requests till completion.') ++ parser.add_argument('filename', type=str) ++ ++ args = parser.parse_args() ++ ++ with open(args.filename, 'rb') as f: ++ data = pickle.load(f) ++ raw_results: List[TMeasurement] = data["results"] ++ ++ results = defaultdict(lambda: list()) ++ for v in raw_results: ++ result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label) ++ if result is not None: ++ KN = result.group(1) ++ else: ++ raise Exception("MKN not found") ++ result = re.search(r"MKN=\((\d+)x\d+x\d+\)", v.task_spec.sub_label) ++ if result is not None: ++ M = result.group(1) ++ else: ++ raise Exception("MKN not found") ++ ++ kernel = v.task_spec.description ++ results[KN].append({ ++ "kernel": kernel, ++ "batch_size": M, ++ "median": v.median ++ }) ++ ++ rows = int(math.ceil(len(results) / 2)) ++ fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows)) ++ axs = axs.flatten() ++ for axs_idx, (shape, data) in enumerate(results.items()): ++ plt.sca(axs[axs_idx]) ++ df = pd.DataFrame(data) ++ sns.lineplot(data=df, ++ x="batch_size", ++ y="median", ++ hue="kernel", ++ style="kernel", ++ markers=True, ++ dashes=False, ++ palette="Dark2") ++ plt.title(f"Shape: {shape}") ++ plt.ylabel("time (median, s)") ++ plt.tight_layout() ++ plt.savefig("graph_machete_bench.pdf") +diff --git a/benchmarks/kernels/requirements.txt b/benchmarks/kernels/requirements.txt +new file mode 100644 +index 0000000..1411a4a +--- /dev/null ++++ b/benchmarks/kernels/requirements.txt +@@ -0,0 +1 @@ ++pandas +\ No newline at end of file +diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py +new file mode 100644 +index 0000000..51f24f3 +--- /dev/null ++++ b/benchmarks/kernels/weight_shapes.py +@@ -0,0 +1,49 @@ ++# Weight Shapes are in the format ++# ([K, N], TP_SPLIT_DIM) ++# Example: ++# A shape of ([14336, 4096], 
0) indicates the following GEMM shape, ++# - TP1 : K = 14336, N = 4096 ++# - TP2 : K = 7168, N = 4096 ++# A shape of ([4096, 6144], 1) indicates the following GEMM shape, ++# - TP1 : K = 4096, N = 6144 ++# - TP4 : K = 4096, N = 1536 ++ ++# TP1 shapes ++WEIGHT_SHAPES = { ++ "mistralai/Mistral-7B-v0.1": [ ++ ([4096, 6144], 1), ++ ([4096, 4096], 0), ++ ([4096, 28672], 1), ++ ([14336, 4096], 0), ++ ], ++ "meta-llama/Llama-2-7b-hf": [ ++ ([4096, 12288], 1), ++ ([4096, 4096], 0), ++ ([4096, 22016], 1), ++ ([11008, 4096], 0), ++ ], ++ "meta-llama/Llama-3-8b": [ ++ ([4096, 6144], 1), ++ ([4096, 4096], 0), ++ ([4096, 28672], 1), ++ ([14336, 4096], 0), ++ ], ++ "meta-llama/Llama-2-13b-hf": [ ++ ([5120, 15360], 1), ++ ([5120, 5120], 0), ++ ([5120, 27648], 1), ++ ([13824, 5120], 0), ++ ], ++ "meta-llama/Llama-2-70b-hf": [ ++ ([8192, 10240], 1), ++ ([8192, 8192], 0), ++ ([8192, 57344], 1), ++ ([28672, 8192], 0), ++ ], ++ "meta-llama/Llama-3.1-405b-hf": [ ++ ([16384, 18432], 1), ++ ([16384, 16384], 0), ++ ([16384, 106496], 1), ++ ([53248, 16384], 0), ++ ], ++} +diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh +index 64d3c4f..ba7383d 100755 +--- a/benchmarks/launch_tgi_server.sh ++++ b/benchmarks/launch_tgi_server.sh +@@ -4,13 +4,13 @@ PORT=8000 + MODEL=$1 + TOKENS=$2 + +-docker run --gpus all --shm-size 1g -p $PORT:80 \ +- -v $PWD/data:/data \ +- ghcr.io/huggingface/text-generation-inference:1.4.0 \ +- --model-id $MODEL \ ++docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \ ++ -v "$PWD/data:/data" \ ++ ghcr.io/huggingface/text-generation-inference:2.2.0 \ ++ --model-id "$MODEL" \ + --sharded false \ + --max-input-length 1024 \ + --max-total-tokens 2048 \ + --max-best-of 5 \ + --max-concurrent-requests 5000 \ +- --max-batch-total-tokens $TOKENS ++ --max-batch-total-tokens "$TOKENS" +diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py +new file mode 100644 +index 0000000..d16d6f9 +--- /dev/null ++++ b/benchmarks/overheads/benchmark_hashing.py +@@ -0,0 +1,59 @@ ++import cProfile ++import pstats ++ ++from vllm import LLM, SamplingParams ++from vllm.utils import FlexibleArgumentParser ++ ++# A very long prompt, total number of tokens is about 15k. ++LONG_PROMPT = ["You are an expert in large language models, aren't you?" 
++ ] * 1000 ++LONG_PROMPT = ' '.join(LONG_PROMPT) ++ ++ ++def main(args): ++ llm = LLM( ++ model=args.model, ++ enforce_eager=True, ++ enable_prefix_caching=True, ++ tensor_parallel_size=args.tensor_parallel_size, ++ ) ++ ++ sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) ++ profiler = cProfile.Profile() ++ ++ print("------warm up------") ++ for i in range(3): ++ output = llm.generate(LONG_PROMPT, sampling_params) ++ print(output[0].outputs[0].text) ++ ++ print("------start generating------") ++ for i in range(3): ++ profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)', ++ globals(), locals()) ++ ++ # analyze the runtime of hashing function ++ stats = pstats.Stats(profiler) ++ stats.sort_stats('cumulative') ++ total_time = 0 ++ total_calls = 0 ++ for func in stats.stats: ++ if 'hash_of_block' in func[2]: ++ total_time = stats.stats[func][3] ++ total_calls = stats.stats[func][0] ++ percentage = (total_time / stats.total_tt) * 100 ++ print(f"Hashing took {total_time:.2f} seconds," ++ f"{percentage:.2f}% of the total runtime.") ++ ++ ++if __name__ == "__main__": ++ parser = FlexibleArgumentParser( ++ description='Benchmark the performance of hashing function in' ++ 'automatic prefix caching.') ++ parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k') ++ parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) ++ parser.add_argument('--output-len', type=int, default=10) ++ parser.add_argument('--enable-prefix-caching', ++ action='store_true', ++ help='enable prefix caching') ++ args = parser.parse_args() ++ main(args) +diff --git a/benchmarks/structured_schemas/structured_schema_1.json b/benchmarks/structured_schemas/structured_schema_1.json +new file mode 100644 +index 0000000..6003698 +--- /dev/null ++++ b/benchmarks/structured_schemas/structured_schema_1.json +@@ -0,0 +1,113 @@ ++{ ++ "$schema": ++ "https://json-schema.org/draft/2020-12/schema", ++ "title": ++ "User Profile", ++ "type": ++ "object", ++ "properties": { ++ "userId": { ++ "type": "string", ++ "description": "Unique identifier for the user." ++ }, ++ "personalInfo": { ++ "type": "object", ++ "properties": { ++ "firstName": { ++ "type": "string", ++ "description": "The user's first name." ++ }, ++ "lastName": { ++ "type": "string", ++ "description": "The user's last name." ++ }, ++ "age": { ++ "type": "integer", ++ "minimum": 0, ++ "description": "The user's age." ++ }, ++ "phoneNumbers": { ++ "type": ++ "array", ++ "items": { ++ "type": "object", ++ "properties": { ++ "type": { ++ "type": "string", ++ "enum": ["home", "work", "mobile"], ++ "description": "Type of phone number." ++ }, ++ "number": { ++ "type": "string", ++ "pattern": "^\\+?[1-9]\\d{1,14}$", ++ "description": "Phone number in E.164 format." ++ } ++ }, ++ "required": ["type", "number"] ++ }, ++ "description": ++ "List of phone numbers associated with the user." ++ } ++ }, ++ "required": ["firstName", "lastName"] ++ }, ++ "address": { ++ "type": "object", ++ "properties": { ++ "street": { ++ "type": "string", ++ "description": "Street address." ++ }, ++ "city": { ++ "type": "string", ++ "description": "City name." ++ }, ++ "state": { ++ "type": "string", ++ "description": "State or province." ++ }, ++ "postalCode": { ++ "type": "string", ++ "pattern": "^\\d{5}(-\\d{4})?$", ++ "description": "Postal code." ++ }, ++ "country": { ++ "type": "string", ++ "description": "Country name." 
++ } ++ }, ++ "required": ["street", "city", "state", "postalCode", "country"] ++ }, ++ "preferences": { ++ "type": "object", ++ "properties": { ++ "newsletterSubscribed": { ++ "type": ++ "boolean", ++ "description": ++ "Indicates if the user is subscribed to the newsletter." ++ }, ++ "favoriteCategories": { ++ "type": "array", ++ "items": { ++ "type": "string" ++ }, ++ "description": "List of user's favorite categories." ++ } ++ }, ++ "required": ["newsletterSubscribed"] ++ }, ++ "accountStatus": { ++ "type": "string", ++ "enum": ["active", "inactive", "suspended"], ++ "description": "Current status of the user's account." ++ }, ++ "registrationDate": { ++ "type": "string", ++ "format": "date-time", ++ "description": "ISO 8601 formatted date-time of user registration." ++ } ++ }, ++ "required": ++ ["userId", "personalInfo", "address", "accountStatus", "registrationDate"] ++} +\ No newline at end of file +diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake +index 0cf3776..714abca 100644 +--- a/cmake/cpu_extension.cmake ++++ b/cmake/cpu_extension.cmake +@@ -1,5 +1,14 @@ ++include(FetchContent) ++ ++set(CMAKE_CXX_STANDARD_REQUIRED ON) ++set(CMAKE_CXX_EXTENSIONS ON) + set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + ++if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") ++ set(MACOSX_FOUND TRUE) ++endif() ++ ++ + # + # Define environment variables for special configurations + # +@@ -9,21 +18,40 @@ endif() + + include_directories("${CMAKE_SOURCE_DIR}/csrc") + ++ ++set (ENABLE_NUMA TRUE) ++ + # + # Check the compile flags + # +-list(APPEND CXX_COMPILE_FLAGS +- "-fopenmp" +- "-DVLLM_CPU_EXTENSION") + +-execute_process(COMMAND cat /proc/cpuinfo +- RESULT_VARIABLE CPUINFO_RET +- OUTPUT_VARIABLE CPUINFO) ++if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") ++ list(APPEND CXX_COMPILE_FLAGS ++ "-mf16c" ++ ) ++endif() + +-if (NOT CPUINFO_RET EQUAL 0) +- message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo") ++if(MACOSX_FOUND) ++ list(APPEND CXX_COMPILE_FLAGS ++ "-Xpreprocessor" ++ "-fopenmp" ++ "-DVLLM_CPU_EXTENSION") ++else() ++ list(APPEND CXX_COMPILE_FLAGS ++ "-fopenmp" ++ "-DVLLM_CPU_EXTENSION") + endif() + ++if (NOT MACOSX_FOUND) ++ execute_process(COMMAND cat /proc/cpuinfo ++ RESULT_VARIABLE CPUINFO_RET ++ OUTPUT_VARIABLE CPUINFO) ++ if (NOT CPUINFO_RET EQUAL 0) ++ message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo") ++ endif() ++endif() ++ ++ + function (find_isa CPUINFO TARGET OUT) + string(FIND ${CPUINFO} ${TARGET} ISA_FOUND) + if(NOT ISA_FOUND EQUAL -1) +@@ -33,9 +61,30 @@ function (find_isa CPUINFO TARGET OUT) + endif() + endfunction() + +-find_isa(${CPUINFO} "avx512f" AVX512_FOUND) ++function (is_avx512_disabled OUT) ++ set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512}) ++ if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true") ++ set(${OUT} ON PARENT_SCOPE) ++ else() ++ set(${OUT} OFF PARENT_SCOPE) ++ endif() ++endfunction() ++ ++is_avx512_disabled(AVX512_DISABLED) + +-if (AVX512_FOUND) ++if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") ++ set(APPLE_SILICON_FOUND TRUE) ++else() ++ find_isa(${CPUINFO} "avx2" AVX2_FOUND) ++ find_isa(${CPUINFO} "avx512f" AVX512_FOUND) ++ find_isa(${CPUINFO} "POWER10" POWER10_FOUND) ++ find_isa(${CPUINFO} "POWER9" POWER9_FOUND) ++ find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support ++ find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support ++endif() ++ ++ ++if (AVX512_FOUND AND NOT AVX512_DISABLED) + list(APPEND CXX_COMPILE_FLAGS + "-mavx512f" + "-mavx512vl" +@@ -44,8 +93,8 @@ if (AVX512_FOUND) 
+ + find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND) + if (AVX512BF16_FOUND OR ENABLE_AVX512BF16) +- if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND +- CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) ++ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND ++ CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) + list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16") + else() + message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3") +@@ -53,16 +102,75 @@ if (AVX512_FOUND) + else() + message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.") + endif() ++ ++elseif (AVX2_FOUND) ++ list(APPEND CXX_COMPILE_FLAGS "-mavx2") ++ message(WARNING "vLLM CPU backend using AVX2 ISA") ++ ++elseif (POWER9_FOUND OR POWER10_FOUND) ++ message(STATUS "PowerPC detected") ++ # Check for PowerPC VSX support ++ list(APPEND CXX_COMPILE_FLAGS ++ "-mvsx" ++ "-mcpu=native" ++ "-mtune=native") ++ ++elseif (ASIMD_FOUND) ++ message(STATUS "ARMv8 or later architecture detected") ++ if(ARM_BF16_FOUND) ++ message(STATUS "BF16 extension detected") ++ set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16") ++ add_compile_definitions(ARM_BF16_SUPPORT) ++ else() ++ message(WARNING "BF16 functionality is not available") ++ set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") ++ endif() ++ list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) ++elseif(APPLE_SILICON_FOUND) ++ message(STATUS "Apple Silicon Detected") ++ set(ENABLE_NUMA OFF) + else() +- message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.") ++ message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") + endif() + +-message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") +- +- + # +-# Define extension targets ++# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms) + # ++if (AVX512_FOUND AND NOT AVX512_DISABLED) ++ FetchContent_Declare( ++ oneDNN ++ GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git ++ GIT_TAG v3.6 ++ GIT_PROGRESS TRUE ++ GIT_SHALLOW TRUE ++ ) ++ ++ set(ONEDNN_LIBRARY_TYPE "STATIC") ++ set(ONEDNN_BUILD_DOC "OFF") ++ set(ONEDNN_BUILD_EXAMPLES "OFF") ++ set(ONEDNN_BUILD_TESTS "OFF") ++ set(ONEDNN_ENABLE_WORKLOAD "INFERENCE") ++ set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") ++ set(ONEDNN_BUILD_GRAPH "OFF") ++ set(ONEDNN_ENABLE_JIT_PROFILING "OFF") ++ set(ONEDNN_ENABLE_ITT_TASKS "OFF") ++ set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") ++ set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") ++ set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) ++ ++ FetchContent_MakeAvailable(oneDNN) ++ ++ list(APPEND LIBS dnnl) ++endif() ++ ++message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") ++ ++if(ENABLE_NUMA) ++ list(APPEND LIBS numa) ++else() ++ message(STATUS "NUMA is disabled") ++ add_compile_definitions(-DVLLM_NUMA_DISABLED) ++endif() + + # + # _C extension +@@ -71,20 +179,30 @@ set(VLLM_EXT_SRC + "csrc/cpu/activation.cpp" + "csrc/cpu/attention.cpp" + "csrc/cpu/cache.cpp" ++ "csrc/cpu/utils.cpp" + "csrc/cpu/layernorm.cpp" + "csrc/cpu/pos_encoding.cpp" +- "csrc/cpu/pybind.cpp") ++ "csrc/cpu/torch_bindings.cpp") ++ ++if (AVX512_FOUND AND NOT AVX512_DISABLED) ++ set(VLLM_EXT_SRC ++ "csrc/cpu/quant.cpp" ++ ${VLLM_EXT_SRC}) ++endif() ++ ++# ++# Define extension targets ++# + + define_gpu_extension_target( + _C + DESTINATION vllm + LANGUAGE CXX + SOURCES ${VLLM_EXT_SRC} ++ LIBRARIES ${LIBS} + COMPILE_FLAGS ${CXX_COMPILE_FLAGS} +- WITH_SOABI ++ USE_SABI 3 ++ WITH_SOABI + ) + +-add_custom_target(default) 
+-message(STATUS "Enabling C extension.") +-add_dependencies(default _C) +- ++message(STATUS "Enabling C extension.") +\ No newline at end of file +diff --git a/cmake/utils.cmake b/cmake/utils.cmake +index 7c71673..40430da 100644 +--- a/cmake/utils.cmake ++++ b/cmake/utils.cmake +@@ -5,7 +5,7 @@ + macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) + file(REAL_PATH ${EXECUTABLE} EXECUTABLE) + set(Python_EXECUTABLE ${EXECUTABLE}) +- find_package(Python COMPONENTS Interpreter Development.Module) ++ find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule) + if (NOT Python_FOUND) + message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") + endif() +@@ -99,7 +99,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) + "Failed to determine torch nvcc compiler flags") + + if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) +- list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2") ++ list(APPEND GPU_FLAGS "-DENABLE_FP8") + endif() + if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0) + list(REMOVE_ITEM GPU_FLAGS +@@ -119,7 +119,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) + + list(APPEND GPU_FLAGS + "-DUSE_ROCM" +- "-DENABLE_FP8_E4M3" ++ "-DENABLE_FP8" + "-U__HIP_NO_HALF_CONVERSIONS__" + "-U__HIP_NO_HALF_OPERATORS__" + "-fno-gpu-rdc") +@@ -133,10 +133,181 @@ macro(string_to_ver OUT_VER IN_STR) + string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) + endmacro() + ++# ++# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in ++# `CUDA_ARCH_FLAGS`. ++# ++# Example: ++# CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" ++# clear_cuda_arches(CUDA_ARCH_FLAGS) ++# CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75" ++# CMAKE_CUDA_FLAGS="-Wall" ++# ++macro(clear_cuda_arches CUDA_ARCH_FLAGS) ++ # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS` ++ string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS ++ ${CMAKE_CUDA_FLAGS}) ++ ++ # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified ++ # and passed back via the `CUDA_ARCHITECTURES` property. ++ string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS ++ ${CMAKE_CUDA_FLAGS}) ++endmacro() ++ ++# ++# Extract unique CUDA architectures from a list of compute capabilities codes in ++# the form `[]`, convert them to the form sort ++# `.`, dedupes them and then sorts them in ascending order and ++# stores them in `OUT_ARCHES`. ++# ++# Example: ++# CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a" ++# extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS) ++# OUT_ARCHES="7.5;...;9.0" ++function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS) ++ set(_CUDA_ARCHES) ++ foreach(_ARCH ${CUDA_ARCH_FLAGS}) ++ string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH}) ++ if (_COMPUTE) ++ set(_COMPUTE ${CMAKE_MATCH_1}) ++ endif() ++ ++ string_to_ver(_COMPUTE_VER ${_COMPUTE}) ++ list(APPEND _CUDA_ARCHES ${_COMPUTE_VER}) ++ endforeach() ++ ++ list(REMOVE_DUPLICATES _CUDA_ARCHES) ++ list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING) ++ set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE) ++endfunction() ++ ++# ++# For a specific file set the `-gencode` flag in compile options conditionally ++# for the CUDA language. 
++# ++# Example: ++# set_gencode_flag_for_srcs( ++# SRCS "foo.cu" ++# ARCH "compute_75" ++# CODE "sm_75") ++# adds: "-gencode arch=compute_75,code=sm_75" to the compile options for ++# `foo.cu` (only for the CUDA language). ++# ++macro(set_gencode_flag_for_srcs) ++ set(options) ++ set(oneValueArgs ARCH CODE) ++ set(multiValueArgs SRCS) ++ cmake_parse_arguments(arg "${options}" "${oneValueArgs}" ++ "${multiValueArgs}" ${ARGN} ) ++ set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE}) ++ set_property( ++ SOURCE ${arg_SRCS} ++ APPEND PROPERTY ++ COMPILE_OPTIONS "$<$:${_FLAG}>" ++ ) ++ ++ message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}") ++endmacro(set_gencode_flag_for_srcs) ++ ++# ++# For a list of source files set the `-gencode` flags in the files specific ++# compile options (specifically for the CUDA language). ++# ++# arguments are: ++# SRCS: list of source files ++# CUDA_ARCHS: list of CUDA architectures in the form `.[letter]` ++# BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built ++# for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS ++# that is larger than BUILD_PTX_FOR_ARCH. ++# ++macro(set_gencode_flags_for_srcs) ++ set(options) ++ set(oneValueArgs BUILD_PTX_FOR_ARCH) ++ set(multiValueArgs SRCS CUDA_ARCHS) ++ cmake_parse_arguments(arg "${options}" "${oneValueArgs}" ++ "${multiValueArgs}" ${ARGN} ) ++ ++ foreach(_ARCH ${arg_CUDA_ARCHS}) ++ string(REPLACE "." "" _ARCH "${_ARCH}") ++ set_gencode_flag_for_srcs( ++ SRCS ${arg_SRCS} ++ ARCH "compute_${_ARCH}" ++ CODE "sm_${_ARCH}") ++ endforeach() ++ ++ if (${arg_BUILD_PTX_FOR_ARCH}) ++ list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) ++ list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH) ++ if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH}) ++ string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}") ++ set_gencode_flag_for_srcs( ++ SRCS ${arg_SRCS} ++ ARCH "compute_${_PTX_ARCH}" ++ CODE "compute_${_PTX_ARCH}") ++ endif() ++ endif() ++endmacro() ++ ++# ++# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form ++# `.[letter]` compute the "loose intersection" with the ++# `TGT_CUDA_ARCHS` list of gencodes. ++# The loose intersection is defined as: ++# { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} } ++# where `<=` is the version comparison operator. ++# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version ++# in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. ++# We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is ++# in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add ++# 9.0a to the result. ++# The result is stored in `OUT_CUDA_ARCHS`. 
++# ++# Example: ++# SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a" ++# TGT_CUDA_ARCHS="8.0;8.9;9.0" ++# cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) ++# OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a" ++# ++function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) ++ list(REMOVE_DUPLICATES SRC_CUDA_ARCHS) ++ ++ # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should ++ # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS ++ set(_CUDA_ARCHS) ++ if ("9.0a" IN_LIST SRC_CUDA_ARCHS) ++ list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a") ++ if ("9.0" IN_LIST TGT_CUDA_ARCHS) ++ set(_CUDA_ARCHS "9.0a") ++ endif() ++ endif() ++ ++ list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) ++ ++ # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is ++ # less or eqault to ARCH ++ foreach(_ARCH ${CUDA_ARCHS}) ++ set(_TMP_ARCH) ++ foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) ++ if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) ++ set(_TMP_ARCH ${_SRC_ARCH}) ++ else() ++ break() ++ endif() ++ endforeach() ++ if (_TMP_ARCH) ++ list(APPEND _CUDA_ARCHS ${_TMP_ARCH}) ++ endif() ++ endforeach() ++ ++ list(REMOVE_DUPLICATES _CUDA_ARCHS) ++ set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) ++endfunction() ++ + # + # Override the GPU architectures detected by cmake/torch and filter them by + # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in +-# `GPU_ARCHES`. ++# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set ++# the architectures on a per file basis. + # + # Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`. + # +@@ -147,16 +318,23 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) + if (${GPU_LANG} STREQUAL "HIP") + # + # `GPU_ARCHES` controls the `--offload-arch` flags. +- # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled +- # via the `PYTORCH_ROCM_ARCH` env variable. + # +- ++ # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list, ++ # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling ++ # "rocm_agent_enumerator" in "enable_language(HIP)" ++ # (in file Modules/CMakeDetermineHIPCompiler.cmake) ++ # ++ if(DEFINED ENV{PYTORCH_ROCM_ARCH}) ++ set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH}) ++ else() ++ set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES}) ++ endif() + # + # Find the intersection of the supported + detected architectures to + # set the module architecture flags. + # + set(${GPU_ARCHES}) +- foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES}) ++ foreach (_ARCH ${HIP_ARCHITECTURES}) + if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) + list(APPEND ${GPU_ARCHES} ${_ARCH}) + endif() +@@ -164,112 +342,10 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) + + if(NOT ${GPU_ARCHES}) + message(FATAL_ERROR +- "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" ++ "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is" + " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.") + endif() +- +- elseif(${GPU_LANG} STREQUAL "CUDA") +- # +- # Setup/process CUDA arch flags. +- # +- # The torch cmake setup hardcodes the detected architecture flags in +- # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it +- # can't modified on a per-target basis, e.g. for the `punica` extension. +- # So, all the `-gencode` flags need to be extracted and removed from +- # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method. 
+- # Since it's not possible to use `target_compiler_options` for adding target +- # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property +- # must be used instead. This requires repackaging the architecture flags +- # into a format that cmake expects for `CUDA_ARCHITECTURES`. +- # +- # This is a bit fragile in that it depends on torch using `-gencode` as opposed +- # to one of the other nvcc options to specify architectures. +- # +- # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override +- # detected architectures. +- # +- message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") +- +- # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS` +- string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS +- ${CMAKE_CUDA_FLAGS}) +- +- # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified +- # and passed back via the `CUDA_ARCHITECTURES` property. +- string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS +- ${CMAKE_CUDA_FLAGS}) +- +- # If this error is triggered, it might mean that torch has changed how it sets +- # up nvcc architecture code generation flags. +- if (NOT _CUDA_ARCH_FLAGS) +- message(FATAL_ERROR +- "Could not find any architecture related code generation flags in " +- "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})") +- endif() +- +- message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") +- message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}") +- +- # Initialize the architecture lists to empty. +- set(${GPU_ARCHES}) +- +- # Process each `gencode` flag. +- foreach(_ARCH ${_CUDA_ARCH_FLAGS}) +- # For each flag, extract the version number and whether it refers to PTX +- # or native code. +- # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding +- # for that match. +- +- string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH}) +- if (_COMPUTE) +- set(_COMPUTE ${CMAKE_MATCH_1}) +- endif() +- +- string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH}) +- if (_SM) +- set(_SM ${CMAKE_MATCH_1}) +- endif() +- +- string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH}) +- if (_CODE) +- set(_CODE ${CMAKE_MATCH_1}) +- endif() +- +- # Make sure the virtual architecture can be matched. +- if (NOT _COMPUTE) +- message(FATAL_ERROR +- "Could not determine virtual architecture from: ${_ARCH}.") +- endif() +- +- # One of sm_ or compute_ must exist. +- if ((NOT _SM) AND (NOT _CODE)) +- message(FATAL_ERROR +- "Could not determine a codegen architecture from: ${_ARCH}.") +- endif() +- +- if (_SM) +- # -real suffix let CMake to only generate elf code for the kernels. +- # we want this, otherwise the added ptx (default) will increase binary size. +- set(_VIRT "-real") +- set(_CODE_ARCH ${_SM}) +- else() +- # -virtual suffix let CMake to generate ptx code for the kernels. +- set(_VIRT "-virtual") +- set(_CODE_ARCH ${_CODE}) +- endif() +- +- # Check if the current version is in the supported arch list. +- string_to_ver(_CODE_VER ${_CODE_ARCH}) +- if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST) +- message(STATUS "discarding unsupported CUDA arch ${_VER}.") +- continue() +- endif() +- +- # Add it to the arch list. +- list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}") +- endforeach() + endif() +- message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}") + endmacro() + + # +@@ -294,6 +370,7 @@ endmacro() + # INCLUDE_DIRECTORIES - Extra include directories. + # LIBRARIES - Extra link libraries. + # WITH_SOABI - Generate library with python SOABI suffix name. 
++# USE_SABI - Use python stable api + # + # Note: optimization level/debug info is set via cmake build type. + # +@@ -301,7 +378,7 @@ function (define_gpu_extension_target GPU_MOD_NAME) + cmake_parse_arguments(PARSE_ARGV 1 + GPU + "WITH_SOABI" +- "DESTINATION;LANGUAGE" ++ "DESTINATION;LANGUAGE;USE_SABI" + "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES") + + # Add hipify preprocessing step when building with HIP/ROCm. +@@ -315,7 +392,11 @@ function (define_gpu_extension_target GPU_MOD_NAME) + set(GPU_WITH_SOABI) + endif() + +- Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI}) ++ if (GPU_USE_SABI) ++ Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}") ++ else() ++ Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}") ++ endif() + + if (GPU_LANGUAGE STREQUAL "HIP") + # Make this target dependent on the hipify preprocessor step. +@@ -338,17 +419,15 @@ function (define_gpu_extension_target GPU_MOD_NAME) + target_include_directories(${GPU_MOD_NAME} PRIVATE csrc + ${GPU_INCLUDE_DIRECTORIES}) + +- target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY} +- ${GPU_LIBRARIES}) ++ target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES}) + + # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of + # dependencies that are not necessary and may not be installed. + if (GPU_LANGUAGE STREQUAL "CUDA") +- target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB} +- ${CUDA_LIBRARIES}) ++ target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver) + else() + target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + endif() + +- install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION}) ++ install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME}) + endfunction() +diff --git a/collect_env.py b/collect_env.py +index 1ecfeb8..254c19b 100644 +--- a/collect_env.py ++++ b/collect_env.py +@@ -1,17 +1,19 @@ + # ruff: noqa + # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py + +-# Unlike the rest of the PyTorch this file must be python2 compliant. +-# This script outputs relevant system environment info +-# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` + import datetime + import locale + import os + import re + import subprocess + import sys ++# Unlike the rest of the PyTorch this file must be python2 compliant. 
++# This script outputs relevant system environment info ++# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` + from collections import namedtuple + ++from vllm.envs import environment_variables ++ + try: + import torch + TORCH_AVAILABLE = True +@@ -52,6 +54,7 @@ SystemEnv = namedtuple( + 'vllm_version', # vllm specific field + 'vllm_build_flags', # vllm specific field + 'gpu_topo', # vllm specific field ++ 'env_vars', + ]) + + DEFAULT_CONDA_PATTERNS = { +@@ -64,6 +67,10 @@ DEFAULT_CONDA_PATTERNS = { + "triton", + "optree", + "nccl", ++ "transformers", ++ "zmq", ++ "nvidia", ++ "pynvml", + } + + DEFAULT_PIP_PATTERNS = { +@@ -75,6 +82,10 @@ DEFAULT_PIP_PATTERNS = { + "optree", + "onnx", + "nccl", ++ "transformers", ++ "zmq", ++ "nvidia", ++ "pynvml", + } + + +@@ -259,12 +270,16 @@ def get_neuron_sdk_version(run_lambda): + + + def get_vllm_version(): +- try: +- import vllm +- return vllm.__version__ +- except ImportError: +- return 'N/A' ++ from vllm import __version__, __version_tuple__ ++ ++ if __version__ == "dev": ++ return "N/A (dev)" + ++ if len(__version_tuple__) == 4: # dev build ++ git_sha = __version_tuple__[-1][1:] # type: ignore ++ return f"{__version__} (git sha: {git_sha}" ++ ++ return __version__ + + def summarize_vllm_build_flags(): + # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. +@@ -276,9 +291,14 @@ def summarize_vllm_build_flags(): + + + def get_gpu_topo(run_lambda): ++ output = None ++ + if get_platform() == 'linux': +- return run_and_read_all(run_lambda, 'nvidia-smi topo -m') +- return None ++ output = run_and_read_all(run_lambda, 'nvidia-smi topo -m') ++ if output is None: ++ output = run_and_read_all(run_lambda, 'rocm-smi --showtopo') ++ ++ return output + + + # example outputs of CPU infos +@@ -495,6 +515,22 @@ def is_xnnpack_available(): + else: + return "N/A" + ++def get_env_vars(): ++ env_vars = '' ++ secret_terms=('secret', 'token', 'api', 'access', 'password') ++ report_prefix = ("TORCH", "NCCL", "PYTORCH", ++ "CUDA", "CUBLAS", "CUDNN", ++ "OMP_", "MKL_", ++ "NVIDIA") ++ for k, v in os.environ.items(): ++ if any(term in k.lower() for term in secret_terms): ++ continue ++ if k in environment_variables: ++ env_vars = env_vars + "{}={}".format(k, v) + "\n" ++ if k.startswith(report_prefix): ++ env_vars = env_vars + "{}={}".format(k, v) + "\n" ++ ++ return env_vars + + def get_env_info(): + run_lambda = run +@@ -566,6 +602,7 @@ def get_env_info(): + vllm_version=vllm_version, + vllm_build_flags=vllm_build_flags, + gpu_topo=gpu_topo, ++ env_vars=get_env_vars(), + ) + + +@@ -601,6 +638,11 @@ Versions of relevant libraries: + {conda_packages} + """.strip() + ++# both the above code and the following code use `strip()` to ++# remove leading/trailing whitespaces, so we need to add a newline ++# in between to separate the two sections ++env_info_fmt += "\n" ++ + env_info_fmt += """ + ROCM Version: {rocm_version} + Neuron SDK Version: {neuron_sdk_version} +@@ -609,6 +651,8 @@ vLLM Build Flags: + {vllm_build_flags} + GPU Topology: + {gpu_topo} ++ ++{env_vars} + """.strip() + + +diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu +index 24d9727..839dc36 100644 +--- a/csrc/activation_kernels.cu ++++ b/csrc/activation_kernels.cu +@@ -1,5 +1,5 @@ + #include +-#include ++#include + #include + + #include +@@ -10,11 +10,11 @@ + namespace vllm { + + // Activation and gating kernel template. 
+-template ++template + __global__ void act_and_mul_kernel( +- scalar_t* __restrict__ out, // [..., d] +- const scalar_t* __restrict__ input, // [..., 2, d] +- const int d) { ++ scalar_t* __restrict__ out, // [..., d] ++ const scalar_t* __restrict__ input, // [..., 2, d] ++ const int d) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); +@@ -23,84 +23,120 @@ __global__ void act_and_mul_kernel( + } + } + +-template ++template + __device__ __forceinline__ T silu_kernel(const T& x) { + // x * sigmoid(x) +- return (T) (((float) x) / (1.0f + expf((float) -x))); ++ return (T)(((float)x) / (1.0f + expf((float)-x))); + } + +-template ++template + __device__ __forceinline__ T gelu_kernel(const T& x) { + // Equivalent to PyTorch GELU with 'none' approximation. + // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38 +- const float f = (float) x; ++ const float f = (float)x; + constexpr float ALPHA = M_SQRT1_2; +- return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA))); ++ return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA))); + } + +-template ++template + __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { + // Equivalent to PyTorch GELU with 'tanh' approximation. + // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30 +- const float f = (float) x; ++ const float f = (float)x; + constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f; + constexpr float KAPPA = 0.044715; + float x_cube = f * f * f; + float inner = BETA * (f + KAPPA * x_cube); +- return (T) (0.5f * f * (1.0f + ::tanhf(inner))); ++ return (T)(0.5f * f * (1.0f + ::tanhf(inner))); + } + +-} // namespace vllm ++} // namespace vllm + + // Launch activation and gating kernel. 
+-#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \ +- int d = input.size(-1) / 2; \ +- int64_t num_tokens = input.numel() / input.size(-1); \ +- dim3 grid(num_tokens); \ +- dim3 block(std::min(d, 1024)); \ +- const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ +- const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ +- VLLM_DISPATCH_FLOATING_TYPES( \ +- input.scalar_type(), \ +- "act_and_mul_kernel", \ +- [&] { \ +- vllm::act_and_mul_kernel><<>>( \ +- out.data_ptr(), \ +- input.data_ptr(), \ +- d); \ +- }); +- +-void silu_and_mul( +- torch::Tensor& out, // [..., d] +- torch::Tensor& input) // [..., 2 * d] ++#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \ ++ int d = input.size(-1) / 2; \ ++ int64_t num_tokens = input.numel() / input.size(-1); \ ++ dim3 grid(num_tokens); \ ++ dim3 block(std::min(d, 1024)); \ ++ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ ++ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ ++ VLLM_DISPATCH_FLOATING_TYPES( \ ++ input.scalar_type(), "act_and_mul_kernel", [&] { \ ++ vllm::act_and_mul_kernel> \ ++ <<>>(out.data_ptr(), \ ++ input.data_ptr(), d); \ ++ }); ++ ++void silu_and_mul(torch::Tensor& out, // [..., d] ++ torch::Tensor& input) // [..., 2 * d] + { + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); + } + +-void gelu_and_mul( +- torch::Tensor& out, // [..., d] +- torch::Tensor& input) // [..., 2 * d] ++void gelu_and_mul(torch::Tensor& out, // [..., d] ++ torch::Tensor& input) // [..., 2 * d] + { + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel); + } + +-void gelu_tanh_and_mul( +- torch::Tensor& out, // [..., d] +- torch::Tensor& input) // [..., 2 * d] ++void gelu_tanh_and_mul(torch::Tensor& out, // [..., d] ++ torch::Tensor& input) // [..., 2 * d] + { + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel); + } + + namespace vllm { + ++template ++__device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) { ++ const float f = (float)x; ++ return (T)(f > threshold ? f : 0.0f); ++} ++ ++template ++__global__ void act_and_mul_kernel_with_param( ++ scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d, ++ const float param) { ++ const int64_t token_idx = blockIdx.x; ++ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { ++ const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); ++ const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); ++ out[token_idx * d + idx] = ACT_FN(x, param) * y; ++ } ++} ++ ++} // namespace vllm ++ ++#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ ++ int d = input.size(-1) / 2; \ ++ int64_t num_tokens = input.numel() / input.size(-1); \ ++ dim3 grid(num_tokens); \ ++ dim3 block(std::min(d, 1024)); \ ++ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ ++ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ ++ VLLM_DISPATCH_FLOATING_TYPES( \ ++ input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \ ++ vllm::act_and_mul_kernel_with_param> \ ++ <<>>(out.data_ptr(), \ ++ input.data_ptr(), d, \ ++ PARAM); \ ++ }); ++ ++void fatrelu_and_mul(torch::Tensor& out, // [..., d], ++ torch::Tensor& input, // [..., 2 * d] ++ double threshold) { ++ LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); ++} ++namespace vllm { ++ + // Element-wise activation kernel template. 
+-template ++template + __global__ void activation_kernel( +- scalar_t* __restrict__ out, // [..., d] +- const scalar_t* __restrict__ input, // [..., d] +- const int d) { ++ scalar_t* __restrict__ out, // [..., d] ++ const scalar_t* __restrict__ input, // [..., d] ++ const int d) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]); +@@ -108,54 +144,61 @@ __global__ void activation_kernel( + } + } + +-} // namespace vllm ++} // namespace vllm + + // Launch element-wise activation kernel. +-#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ +- int d = input.size(-1); \ +- int64_t num_tokens = input.numel() / d; \ +- dim3 grid(num_tokens); \ +- dim3 block(std::min(d, 1024)); \ +- const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ +- const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ +- VLLM_DISPATCH_FLOATING_TYPES( \ +- input.scalar_type(), \ +- "activation_kernel", \ +- [&] { \ +- vllm::activation_kernel><<>>( \ +- out.data_ptr(), \ +- input.data_ptr(), \ +- d); \ +- }); ++#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ ++ int d = input.size(-1); \ ++ int64_t num_tokens = input.numel() / d; \ ++ dim3 grid(num_tokens); \ ++ dim3 block(std::min(d, 1024)); \ ++ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ ++ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ ++ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \ ++ vllm::activation_kernel> \ ++ <<>>(out.data_ptr(), \ ++ input.data_ptr(), d); \ ++ }); + + namespace vllm { + +-template ++template + __device__ __forceinline__ T gelu_new_kernel(const T& x) { +- const float x3 = (float) (x * x * x); +- const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3)))); +- return ((T) 0.5) * x * (((T) 1.0) + t); ++ const float x3 = (float)(x * x * x); ++ const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3)))); ++ return ((T)0.5) * x * (((T)1.0) + t); + } + +-template ++template + __device__ __forceinline__ T gelu_fast_kernel(const T& x) { +- const float f = (float) x; +- const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x)); +- return ((T) 0.5) * x * (((T) 1.0) + t); ++ const float f = (float)x; ++ const T t = ++ (T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x)); ++ return ((T)0.5) * x * (((T)1.0) + t); ++} ++ ++template ++__device__ __forceinline__ T gelu_quick_kernel(const T& x) { ++ // x * sigmoid(1.702 * x) ++ return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x))); + } + +-} // namespace vllm ++} // namespace vllm + +-void gelu_new( +- torch::Tensor& out, // [..., d] +- torch::Tensor& input) // [..., d] ++void gelu_new(torch::Tensor& out, // [..., d] ++ torch::Tensor& input) // [..., d] + { + LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel); + } + +-void gelu_fast( +- torch::Tensor& out, // [..., d] +- torch::Tensor& input) // [..., d] ++void gelu_fast(torch::Tensor& out, // [..., d] ++ torch::Tensor& input) // [..., d] + { + LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); + } ++ ++void gelu_quick(torch::Tensor& out, // [..., d] ++ torch::Tensor& input) // [..., d] ++{ ++ LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel); ++} +diff --git a/csrc/attention/attention_generic.cuh b/csrc/attention/attention_generic.cuh +index 31fb401..62409c0 100644 +--- a/csrc/attention/attention_generic.cuh ++++ b/csrc/attention/attention_generic.cuh +@@ -1,5 +1,6 @@ + /* +- * Adapted from 
https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h ++ * Adapted from ++ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * +@@ -22,31 +23,31 @@ + namespace vllm { + + // A vector type to store Q, K, V elements. +-template ++template + struct Vec {}; + + // A vector type to store FP32 accumulators. +-template ++template + struct FloatVec {}; + + // Template vector operations. +-template ++template + inline __device__ Acc mul(A a, B b); + +-template ++template + inline __device__ float sum(T v); + +-template ++template + inline __device__ float dot(T a, T b) { + return sum(mul(a, b)); + } + +-template ++template + inline __device__ float dot(T a, T b) { + return sum(mul(a, b)); + } + +-template ++template + inline __device__ void zero(T& dst) { + constexpr int WORDS = sizeof(T) / 4; + union { +@@ -61,4 +62,4 @@ inline __device__ void zero(T& dst) { + dst = tmp.raw; + } + +-} // namespace vllm ++} // namespace vllm +diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh +new file mode 100644 +index 0000000..563e143 +--- /dev/null ++++ b/csrc/attention/attention_kernels.cuh +@@ -0,0 +1,676 @@ ++/* ++ * Adapted from ++ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp ++ * Copyright (c) 2023, The vLLM team. ++ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "attention_dtypes.h" ++#include "attention_utils.cuh" ++ ++#ifdef USE_ROCM ++ #include ++ #include "../quantization/fp8/amd/quant_utils.cuh" ++typedef __hip_bfloat16 __nv_bfloat16; ++#else ++ #include "../quantization/fp8/nvidia/quant_utils.cuh" ++#endif ++ ++#ifndef USE_ROCM ++ #define WARP_SIZE 32 ++#else ++ #define WARP_SIZE warpSize ++#endif ++ ++#define MAX(a, b) ((a) > (b) ? (a) : (b)) ++#define MIN(a, b) ((a) < (b) ? (a) : (b)) ++#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) ++ ++namespace vllm { ++ ++// Utility function for attention softmax. ++template ++inline __device__ float block_sum(float* red_smem, float sum) { ++ // Decompose the thread index into warp / lane. ++ int warp = threadIdx.x / WARP_SIZE; ++ int lane = threadIdx.x % WARP_SIZE; ++ ++ // Compute the sum per warp. ++#pragma unroll ++ for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { ++ sum += VLLM_SHFL_XOR_SYNC(sum, mask); ++ } ++ ++ // Warp leaders store the data to shared memory. ++ if (lane == 0) { ++ red_smem[warp] = sum; ++ } ++ ++ // Make sure the data is in shared memory. ++ __syncthreads(); ++ ++ // The warps compute the final sums. 
++ if (lane < NUM_WARPS) { ++ sum = red_smem[lane]; ++ } ++ ++ // Parallel reduction inside the warp. ++#pragma unroll ++ for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { ++ sum += VLLM_SHFL_XOR_SYNC(sum, mask); ++ } ++ ++ // Broadcast to other threads. ++ return VLLM_SHFL_SYNC(sum, 0); ++} ++ ++// TODO(woosuk): Merge the last two dimensions of the grid. ++// Grid: (num_heads, num_seqs, max_num_partitions). ++template // Zero means no partitioning. ++__device__ void paged_attention_kernel( ++ float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] ++ float* __restrict__ max_logits, // [num_seqs, num_heads, ++ // max_num_partitions] ++ scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, ++ // head_size] ++ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] ++ const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, ++ // head_size/x, block_size, x] ++ const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, ++ // head_size, block_size] ++ const int num_kv_heads, // [num_heads] ++ const float scale, ++ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] ++ const int* __restrict__ seq_lens, // [num_seqs] ++ const int max_num_blocks_per_seq, ++ const float* __restrict__ alibi_slopes, // [num_heads] ++ const int q_stride, const int kv_block_stride, const int kv_head_stride, ++ const float k_scale, const float v_scale, const int tp_rank, ++ const int blocksparse_local_blocks, const int blocksparse_vert_stride, ++ const int blocksparse_block_size, const int blocksparse_head_sliding_step) { ++ const int seq_idx = blockIdx.y; ++ const int partition_idx = blockIdx.z; ++ const int max_num_partitions = gridDim.z; ++ constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0; ++ const int seq_len = seq_lens[seq_idx]; ++ if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= seq_len) { ++ // No work to do. Terminate the thread block. ++ return; ++ } ++ ++ const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE); ++ const int num_blocks_per_partition = ++ USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks; ++ ++ // [start_block_idx, end_block_idx) is the range of blocks to process. ++ const int start_block_idx = ++ USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0; ++ const int end_block_idx = ++ MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks); ++ const int num_blocks = end_block_idx - start_block_idx; ++ ++ // [start_token_idx, end_token_idx) is the range of tokens to process. ++ const int start_token_idx = start_block_idx * BLOCK_SIZE; ++ const int end_token_idx = ++ MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len); ++ const int num_tokens = end_token_idx - start_token_idx; ++ ++ constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1); ++ constexpr int NUM_THREAD_GROUPS = ++ NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE ++ // divides NUM_THREADS ++ assert(NUM_THREADS % THREAD_GROUP_SIZE == 0); ++ constexpr int NUM_TOKENS_PER_THREAD_GROUP = ++ DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE); ++ constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; ++ const int thread_idx = threadIdx.x; ++ const int warp_idx = thread_idx / WARP_SIZE; ++ const int lane = thread_idx % WARP_SIZE; ++ ++ const int head_idx = blockIdx.x; ++ const int num_heads = gridDim.x; ++ const int num_queries_per_kv = num_heads / num_kv_heads; ++ const int kv_head_idx = head_idx / num_queries_per_kv; ++ const float alibi_slope = ++ alibi_slopes == nullptr ? 
0.f : alibi_slopes[head_idx]; ++ ++ // A vector type to store a part of a key or a query. ++ // The vector size is configured in such a way that the threads in a thread ++ // group fetch or compute 16 bytes at a time. For example, if the size of a ++ // thread group is 4 and the data type is half, then the vector size is 16 / ++ // (4 * sizeof(half)) == 2. ++ constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1); ++ using K_vec = typename Vec::Type; ++ using Q_vec = typename Vec::Type; ++ using Quant_vec = typename Vec::Type; ++ ++ constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE; ++ constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE; ++ ++ const int thread_group_idx = thread_idx / THREAD_GROUP_SIZE; ++ const int thread_group_offset = thread_idx % THREAD_GROUP_SIZE; ++ ++ // Load the query to registers. ++ // Each thread in a thread group has a different part of the query. ++ // For example, if the the thread group size is 4, then the first thread in ++ // the group has 0, 4, 8, ... th vectors of the query, and the second thread ++ // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because ++ // q is split from a qkv tensor, it may not be contiguous. ++ const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ++ __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; ++#pragma unroll ++ for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; ++ i += NUM_THREAD_GROUPS) { ++ const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE; ++ q_vecs[thread_group_offset][i] = ++ *reinterpret_cast(q_ptr + vec_idx * VEC_SIZE); ++ } ++ __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a ++ // memory wall right before we use q_vecs ++ ++ // Memory planning. ++ extern __shared__ char shared_mem[]; ++ // NOTE(woosuk): We use FP32 for the softmax logits for better accuracy. ++ float* logits = reinterpret_cast(shared_mem); ++ // Workspace for reduction. ++ __shared__ float red_smem[2 * NUM_WARPS]; ++ ++ // x == THREAD_GROUP_SIZE * VEC_SIZE ++ // Each thread group fetches x elements from the key at a time. ++ constexpr int x = 16 / sizeof(cache_t); ++ float qk_max = -FLT_MAX; ++ ++ // Iterate over the key blocks. ++ // Each warp fetches a block of keys for each iteration. ++ // Each thread group in a warp fetches a key from the block, and computes ++ // dot product with the query. ++ const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq; ++ ++ // blocksparse specific vars ++ int bs_block_offset; ++ int q_bs_block_id; ++ if constexpr (IS_BLOCK_SPARSE) { ++ // const int num_blocksparse_blocks = DIVIDE_ROUND_UP(seq_len, ++ // blocksparse_block_size); ++ q_bs_block_id = (seq_len - 1) / blocksparse_block_size; ++ if (blocksparse_head_sliding_step >= 0) ++ // sliding on q heads ++ bs_block_offset = ++ (tp_rank * num_heads + head_idx) * blocksparse_head_sliding_step + 1; ++ else ++ // sliding on kv heads ++ bs_block_offset = (tp_rank * num_kv_heads + kv_head_idx) * ++ (-blocksparse_head_sliding_step) + ++ 1; ++ } ++ ++ for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; ++ block_idx += NUM_WARPS) { ++ // NOTE(woosuk): The block number is stored in int32. However, we cast it to ++ // int64 because int32 can lead to overflow when this variable is multiplied ++ // by large numbers (e.g., kv_block_stride). 
++ // For blocksparse attention: skip computation on blocks that are not ++ // attended ++ if constexpr (IS_BLOCK_SPARSE) { ++ const int k_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size; ++ const bool is_remote = ++ ((k_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0); ++ const bool is_local = ++ (k_bs_block_id > q_bs_block_id - blocksparse_local_blocks); ++ if (!is_remote && !is_local) { ++ for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) { ++ const int physical_block_offset = ++ (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE; ++ const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset; ++ ++ if (thread_group_offset == 0) { ++ // NOTE(linxihui): assign very large number to skipped tokens to ++ // avoid contribution to the sumexp softmax normalizer. This will ++ // not be used at computing sum(softmax*v) as the blocks will be ++ // skipped. ++ logits[token_idx - start_token_idx] = -FLT_MAX; ++ } ++ } ++ continue; ++ } ++ } ++ const int64_t physical_block_number = ++ static_cast(block_table[block_idx]); ++ ++ // Load a key to registers. ++ // Each thread in a thread group has a different part of the key. ++ // For example, if the the thread group size is 4, then the first thread in ++ // the group has 0, 4, 8, ... th vectors of the key, and the second thread ++ // has 1, 5, 9, ... th vectors of the key, and so on. ++ for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) { ++ const int physical_block_offset = ++ (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE; ++ const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset; ++ K_vec k_vecs[NUM_VECS_PER_THREAD]; ++ ++#pragma unroll ++ for (int j = 0; j < NUM_VECS_PER_THREAD; j++) { ++ const cache_t* k_ptr = ++ k_cache + physical_block_number * kv_block_stride + ++ kv_head_idx * kv_head_stride + physical_block_offset * x; ++ const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE; ++ const int offset1 = (vec_idx * VEC_SIZE) / x; ++ const int offset2 = (vec_idx * VEC_SIZE) % x; ++ ++ if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) { ++ k_vecs[j] = *reinterpret_cast( ++ k_ptr + offset1 * BLOCK_SIZE * x + offset2); ++ } else { ++ // Vector conversion from Quant_vec to K_vec. ++ Quant_vec k_vec_quant = *reinterpret_cast( ++ k_ptr + offset1 * BLOCK_SIZE * x + offset2); ++ k_vecs[j] = fp8::scaled_convert( ++ k_vec_quant, k_scale); ++ } ++ } ++ ++ // Compute dot product. ++ // This includes a reduction across the threads in the same thread group. ++ float qk = scale * Qk_dot::dot( ++ q_vecs[thread_group_offset], k_vecs); ++ // Add the ALiBi bias if slopes are given. ++ qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0; ++ ++ if (thread_group_offset == 0) { ++ // Store the partial reductions to shared memory. ++ // NOTE(woosuk): It is required to zero out the masked logits. ++ const bool mask = token_idx >= seq_len; ++ logits[token_idx - start_token_idx] = mask ? 0.f : qk; ++ // Update the max value. ++ qk_max = mask ? qk_max : fmaxf(qk_max, qk); ++ } ++ } ++ } ++ ++ // Perform reduction across the threads in the same warp to get the ++ // max qk value for each "warp" (not across the thread block yet). ++ // The 0-th thread of each thread group already has its max qk value. ++#pragma unroll ++ for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { ++ qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); ++ } ++ if (lane == 0) { ++ red_smem[warp_idx] = qk_max; ++ } ++ __syncthreads(); ++ ++ // TODO(woosuk): Refactor this part. 
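++  // At this point red_smem[0:NUM_WARPS] holds one partial max per warp. The
++  // first NUM_WARPS lanes of the block combine those partial maxima with a
++  // shuffle-based reduction and broadcast the result, so every thread sees the
++  // same qk_max before the exponentials are computed.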
++  // Get the max qk value for the sequence.
++  qk_max = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
++#pragma unroll
++  for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
++    qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
++  }
++  // Broadcast the max qk value to all threads.
++  qk_max = VLLM_SHFL_SYNC(qk_max, 0);
++
++  // Get the sum of the exp values.
++  float exp_sum = 0.f;
++  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
++    float val = __expf(logits[i] - qk_max);
++    logits[i] = val;
++    exp_sum += val;
++  }
++  exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum);
++
++  // Compute softmax.
++  const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
++  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
++    logits[i] *= inv_sum;
++  }
++  __syncthreads();
++
++  // If partitioning is enabled, store the max logit and exp_sum.
++  if (USE_PARTITIONING && thread_idx == 0) {
++    float* max_logits_ptr = max_logits +
++                            seq_idx * num_heads * max_num_partitions +
++                            head_idx * max_num_partitions + partition_idx;
++    *max_logits_ptr = qk_max;
++    float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions +
++                          head_idx * max_num_partitions + partition_idx;
++    *exp_sums_ptr = exp_sum;
++  }
++
++  // Each thread will fetch 16 bytes from the value cache at a time.
++  constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE);
++  using V_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
++  using L_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
++  using V_quant_vec = typename Vec<cache_t, V_VEC_SIZE>::Type;
++  using Float_L_vec = typename FloatVec<L_vec>::Type;
++
++  constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE;
++  constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW;
++  constexpr int NUM_ROWS_PER_THREAD =
++      DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
++
++  // NOTE(woosuk): We use FP32 for the accumulator for better accuracy.
++  float accs[NUM_ROWS_PER_THREAD];
++#pragma unroll
++  for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
++    accs[i] = 0.f;
++  }
++
++  scalar_t zero_value;
++  zero(zero_value);
++  for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
++       block_idx += NUM_WARPS) {
++    // NOTE(woosuk): The block number is stored in int32. However, we cast it to
++    // int64 because int32 can lead to overflow when this variable is multiplied
++    // by large numbers (e.g., kv_block_stride).
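++    // Each thread owns NUM_ROWS_PER_THREAD rows of the head dimension and
++    // accumulates logit-weighted value vectors for them in fp32. For fp16
++    // values with BLOCK_SIZE = 16, V_VEC_SIZE = 8, so NUM_V_VECS_PER_ROW = 2
++    // and a warp covers NUM_ROWS_PER_ITER = 16 rows per iteration
++    // (NUM_ROWS_PER_THREAD = 8 when HEAD_SIZE = 128).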
++ // For blocksparse attention: skip computation on blocks that are not ++ // attended ++ if constexpr (IS_BLOCK_SPARSE) { ++ int v_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size; ++ if (!((v_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0) && ++ !((v_bs_block_id > q_bs_block_id - blocksparse_local_blocks))) { ++ continue; ++ } ++ } ++ const int64_t physical_block_number = ++ static_cast(block_table[block_idx]); ++ const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE; ++ const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset; ++ L_vec logits_vec; ++ from_float(logits_vec, *reinterpret_cast(logits + token_idx - ++ start_token_idx)); ++ ++ const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride + ++ kv_head_idx * kv_head_stride; ++#pragma unroll ++ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { ++ const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; ++ if (row_idx < HEAD_SIZE) { ++ const int offset = row_idx * BLOCK_SIZE + physical_block_offset; ++ V_vec v_vec; ++ ++ if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) { ++ v_vec = *reinterpret_cast(v_ptr + offset); ++ } else { ++ V_quant_vec v_quant_vec = ++ *reinterpret_cast(v_ptr + offset); ++ // Vector conversion from V_quant_vec to V_vec. ++ v_vec = fp8::scaled_convert(v_quant_vec, ++ v_scale); ++ } ++ if (block_idx == num_seq_blocks - 1) { ++ // NOTE(woosuk): When v_vec contains the tokens that are out of the ++ // context, we should explicitly zero out the values since they may ++ // contain NaNs. See ++ // https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472 ++ scalar_t* v_vec_ptr = reinterpret_cast(&v_vec); ++#pragma unroll ++ for (int j = 0; j < V_VEC_SIZE; j++) { ++ v_vec_ptr[j] = token_idx + j < seq_len ? v_vec_ptr[j] : zero_value; ++ } ++ } ++ accs[i] += dot(logits_vec, v_vec); ++ } ++ } ++ } ++ ++ // Perform reduction within each warp. ++#pragma unroll ++ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { ++ float acc = accs[i]; ++#pragma unroll ++ for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { ++ acc += VLLM_SHFL_XOR_SYNC(acc, mask); ++ } ++ accs[i] = acc; ++ } ++ ++ // NOTE(woosuk): A barrier is required because the shared memory space for ++ // logits is reused for the output. ++ __syncthreads(); ++ ++ // Perform reduction across warps. ++ float* out_smem = reinterpret_cast(shared_mem); ++#pragma unroll ++ for (int i = NUM_WARPS; i > 1; i /= 2) { ++ int mid = i / 2; ++ // Upper warps write to shared memory. ++ if (warp_idx >= mid && warp_idx < i) { ++ float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; ++#pragma unroll ++ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { ++ const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; ++ if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { ++ dst[row_idx] = accs[i]; ++ } ++ } ++ } ++ __syncthreads(); ++ ++ // Lower warps update the output. ++ if (warp_idx < mid) { ++ const float* src = &out_smem[warp_idx * HEAD_SIZE]; ++#pragma unroll ++ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { ++ const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; ++ if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { ++ accs[i] += src[row_idx]; ++ } ++ } ++ } ++ __syncthreads(); ++ } ++ ++ // Write the final output. 
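++  // After the cross-warp tree reduction above, warp 0 holds the fully reduced
++  // rows: each lane with lane % NUM_V_VECS_PER_ROW == 0 converts its fp32
++  // accumulators back to scalar_t, and together these lanes cover all
++  // HEAD_SIZE rows at the (seq_idx, head_idx, partition_idx) offset of the
++  // output (tmp_out when running as the V2 kernel).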
++  if (warp_idx == 0) {
++    scalar_t* out_ptr =
++        out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
++        head_idx * max_num_partitions * HEAD_SIZE + partition_idx * HEAD_SIZE;
++#pragma unroll
++    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
++      const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
++      if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
++        from_float(*(out_ptr + row_idx), accs[i]);
++      }
++    }
++  }
++}
++
++// Grid: (num_heads, num_seqs, 1).
++template <typename scalar_t, typename cache_t, int HEAD_SIZE, int BLOCK_SIZE,
++          int NUM_THREADS, vllm::Fp8KVCacheDataType KV_DTYPE,
++          bool IS_BLOCK_SPARSE>
++__global__ void paged_attention_v1_kernel(
++    scalar_t* __restrict__ out,            // [num_seqs, num_heads, head_size]
++    const scalar_t* __restrict__ q,        // [num_seqs, num_heads, head_size]
++    const cache_t* __restrict__ k_cache,   // [num_blocks, num_kv_heads,
++                                           // head_size/x, block_size, x]
++    const cache_t* __restrict__ v_cache,   // [num_blocks, num_kv_heads,
++                                           // head_size, block_size]
++    const int num_kv_heads,                // [num_heads]
++    const float scale,
++    const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
++    const int* __restrict__ seq_lens,      // [num_seqs]
++    const int max_num_blocks_per_seq,
++    const float* __restrict__ alibi_slopes,  // [num_heads]
++    const int q_stride, const int kv_block_stride, const int kv_head_stride,
++    const float k_scale, const float v_scale, const int tp_rank,
++    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
++    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
++  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
++                         KV_DTYPE, IS_BLOCK_SPARSE>(
++      /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache,
++      v_cache, num_kv_heads, scale, block_tables, seq_lens,
++      max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride,
++      kv_head_stride, k_scale, v_scale, tp_rank, blocksparse_local_blocks,
++      blocksparse_vert_stride, blocksparse_block_size,
++      blocksparse_head_sliding_step);
++}
++
++// Grid: (num_heads, num_seqs, max_num_partitions).
++template <typename scalar_t, typename cache_t, int HEAD_SIZE, int BLOCK_SIZE,
++          int NUM_THREADS, vllm::Fp8KVCacheDataType KV_DTYPE,
++          bool IS_BLOCK_SPARSE,
++          int PARTITION_SIZE>
++__global__ void paged_attention_v2_kernel(
++    float* __restrict__ exp_sums,  // [num_seqs, num_heads, max_num_partitions]
++    float* __restrict__ max_logits,  // [num_seqs, num_heads,
++                                     // max_num_partitions]
++    scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads,
++                                     // max_num_partitions, head_size]
++    const scalar_t* __restrict__ q,        // [num_seqs, num_heads, head_size]
++    const cache_t* __restrict__ k_cache,   // [num_blocks, num_kv_heads,
++                                           // head_size/x, block_size, x]
++    const cache_t* __restrict__ v_cache,   // [num_blocks, num_kv_heads,
++                                           // head_size, block_size]
++    const int num_kv_heads,                // [num_heads]
++    const float scale,
++    const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
++    const int* __restrict__ seq_lens,      // [num_seqs]
++    const int max_num_blocks_per_seq,
++    const float* __restrict__ alibi_slopes,  // [num_heads]
++    const int q_stride, const int kv_block_stride, const int kv_head_stride,
++    const float k_scale, const float v_scale, const int tp_rank,
++    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
++    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
++  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
++                         KV_DTYPE, IS_BLOCK_SPARSE, PARTITION_SIZE>(
++      exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
++      block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride,
++      kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank,
++      blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size,
++      blocksparse_head_sliding_step);
++}
++
++// Grid: (num_heads, num_seqs).
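++// One thread block per (head, sequence). The reduce kernel merges the
++// per-partition partials written by paged_attention_v2_kernel with a
++// log-sum-exp style combine: it recomputes the global max logit, rescales each
++// partition's exp_sum by exp(max_logit_j - global_max), and then forms the
++// weighted average of the partitions' tmp_out rows.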
++template ++__global__ void paged_attention_v2_reduce_kernel( ++ scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] ++ const float* __restrict__ exp_sums, // [num_seqs, num_heads, ++ // max_num_partitions] ++ const float* __restrict__ max_logits, // [num_seqs, num_heads, ++ // max_num_partitions] ++ const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, ++ // max_num_partitions, head_size] ++ const int* __restrict__ seq_lens, // [num_seqs] ++ const int max_num_partitions) { ++ const int num_heads = gridDim.x; ++ const int head_idx = blockIdx.x; ++ const int seq_idx = blockIdx.y; ++ const int seq_len = seq_lens[seq_idx]; ++ const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE); ++ if (num_partitions == 1) { ++ // No need to reduce. Only copy tmp_out to out. ++ scalar_t* out_ptr = ++ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; ++ const scalar_t* tmp_out_ptr = ++ tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + ++ head_idx * max_num_partitions * HEAD_SIZE; ++ for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) { ++ out_ptr[i] = tmp_out_ptr[i]; ++ } ++ // Terminate the thread block. ++ return; ++ } ++ ++ constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; ++ const int warp_idx = threadIdx.x / WARP_SIZE; ++ const int lane = threadIdx.x % WARP_SIZE; ++ ++ // Size: 2 * num_partitions. ++ extern __shared__ char shared_mem[]; ++ // Workspace for reduction. ++ __shared__ float red_smem[2 * NUM_WARPS]; ++ ++ // Load max logits to shared memory. ++ float* shared_max_logits = reinterpret_cast(shared_mem); ++ const float* max_logits_ptr = max_logits + ++ seq_idx * num_heads * max_num_partitions + ++ head_idx * max_num_partitions; ++ float max_logit = -FLT_MAX; ++ for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) { ++ const float l = max_logits_ptr[i]; ++ shared_max_logits[i] = l; ++ max_logit = fmaxf(max_logit, l); ++ } ++ __syncthreads(); ++ ++ // Get the global max logit. ++ // Reduce within the warp. ++#pragma unroll ++ for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { ++ max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask)); ++ } ++ if (lane == 0) { ++ red_smem[warp_idx] = max_logit; ++ } ++ __syncthreads(); ++ // Reduce across warps. ++ max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX; ++#pragma unroll ++ for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { ++ max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask)); ++ } ++ // Broadcast the max value to all threads. ++ max_logit = VLLM_SHFL_SYNC(max_logit, 0); ++ ++ // Load rescaled exp sums to shared memory. ++ float* shared_exp_sums = ++ reinterpret_cast(shared_mem + sizeof(float) * num_partitions); ++ const float* exp_sums_ptr = exp_sums + ++ seq_idx * num_heads * max_num_partitions + ++ head_idx * max_num_partitions; ++ float global_exp_sum = 0.0f; ++ for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) { ++ float l = shared_max_logits[i]; ++ float rescaled_exp_sum = exp_sums_ptr[i] * expf(l - max_logit); ++ global_exp_sum += rescaled_exp_sum; ++ shared_exp_sums[i] = rescaled_exp_sum; ++ } ++ __syncthreads(); ++ global_exp_sum = block_sum(&red_smem[NUM_WARPS], global_exp_sum); ++ const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f); ++ ++ // Aggregate tmp_out to out. 
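++  // Final combine: out[i] = sum_j tmp_out[j * HEAD_SIZE + i] *
++  // shared_exp_sums[j] * inv_global_exp_sum, with threads striding over the
++  // HEAD_SIZE elements NUM_THREADS at a time.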
++ const scalar_t* tmp_out_ptr = ++ tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + ++ head_idx * max_num_partitions * HEAD_SIZE; ++ scalar_t* out_ptr = ++ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; ++#pragma unroll ++ for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) { ++ float acc = 0.0f; ++ for (int j = 0; j < num_partitions; ++j) { ++ acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * ++ inv_global_exp_sum; ++ } ++ from_float(out_ptr[i], acc); ++ } ++} ++ ++} // namespace vllm ++ ++#undef WARP_SIZE ++#undef MAX ++#undef MIN ++#undef DIVIDE_ROUND_UP +diff --git a/csrc/attention/attention_utils.cuh b/csrc/attention/attention_utils.cuh +index ff64c4b..826b0ed 100644 +--- a/csrc/attention/attention_utils.cuh ++++ b/csrc/attention/attention_utils.cuh +@@ -1,5 +1,6 @@ + /* +- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp ++ * Adapted from ++ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * +@@ -26,14 +27,14 @@ + namespace vllm { + + // Q*K^T operation. +-template ++template + inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { + using A_vec = typename FloatVec::Type; + // Compute the parallel products for Q*K^T (treat vector lanes separately). + A_vec qk_vec = mul(q[0], k[0]); + #pragma unroll + for (int ii = 1; ii < N; ++ii) { +- qk_vec = fma(q[ii], k[ii], qk_vec); ++ qk_vec = vllm::fma(q[ii], k[ii], qk_vec); + } + + // Finalize the reduction across lanes. +@@ -45,12 +46,12 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { + return qk; + } + +-template ++template + struct Qk_dot { +- template ++ template + static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { + return qk_dot_(q, k); + } + }; + +-} // namespace vllm ++} // namespace vllm +diff --git a/csrc/attention/dtype_bfloat16.cuh b/csrc/attention/dtype_bfloat16.cuh +index 31e0cee..97a25ba 100644 +--- a/csrc/attention/dtype_bfloat16.cuh ++++ b/csrc/attention/dtype_bfloat16.cuh +@@ -1,6 +1,8 @@ + /* +- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp +- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h ++ * Adapted from ++ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp ++ * and ++ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * +@@ -28,8 +30,8 @@ + #include + #include + +- typedef __hip_bfloat162 __nv_bfloat162; +- typedef __hip_bfloat16 __nv_bfloat16; ++typedef __hip_bfloat162 __nv_bfloat162; ++typedef __hip_bfloat16 __nv_bfloat16; + #endif + + #include +@@ -50,37 +52,37 @@ struct bf16_8_t { + }; + + // BF16 vector types for Q, K, V. 
+-template<> ++template <> + struct Vec<__nv_bfloat16, 1> { + using Type = __nv_bfloat16; + }; +-template<> ++template <> + struct Vec<__nv_bfloat16, 2> { + using Type = __nv_bfloat162; + }; +-template<> ++template <> + struct Vec<__nv_bfloat16, 4> { + using Type = bf16_4_t; + }; +-template<> ++template <> + struct Vec<__nv_bfloat16, 8> { + using Type = bf16_8_t; + }; + + // FP32 accumulator vector types corresponding to Vec. +-template<> ++template <> + struct FloatVec<__nv_bfloat16> { + using Type = float; + }; +-template<> ++template <> + struct FloatVec<__nv_bfloat162> { + using Type = float2; + }; +-template<> ++template <> + struct FloatVec { + using Type = Float4_; + }; +-template<> ++template <> + struct FloatVec { + using Type = Float8_; + }; +@@ -92,6 +94,7 @@ inline __device__ float2 bf1622float2(const __nv_bfloat162 val) { + #else + return __bfloat1622float2(val); + #endif ++ __builtin_unreachable(); // Suppress missing return statement warning + } + + inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) { +@@ -100,6 +103,7 @@ inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) { + #else + return __bfloat162bfloat162(val); + #endif ++ __builtin_unreachable(); // Suppress missing return statement warning + } + + // Vector addition. +@@ -108,11 +112,12 @@ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) { + assert(false); + #else + #ifndef USE_ROCM +- return a + b; ++ return a + b; + #else +- return __hadd(a, b); ++ return __hadd(a, b); + #endif + #endif ++ __builtin_unreachable(); // Suppress missing return statement warning + } + + inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b) { +@@ -121,6 +126,7 @@ inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b) { + #else + return __hadd2(a, b); + #endif ++ __builtin_unreachable(); // Suppress missing return statement warning + } + + inline __device__ bf16_4_t add(bf16_4_t a, bf16_4_t b) { +@@ -161,30 +167,32 @@ inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) { + } + + // Vector multiplication. 
+-template<> ++template <> + inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); + #else + return __hmul(a, b); + #endif ++ __builtin_unreachable(); // Suppress missing return statement warning + } + +-template<> ++template <> + inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); + #else + return __hmul2(a, b); + #endif ++ __builtin_unreachable(); // Suppress missing return statement warning + } + +-template<> ++template <> + inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) { + return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b); + } + +-template<> ++template <> + inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) { + bf16_4_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); +@@ -192,7 +200,7 @@ inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) { + return c; + } + +-template<> ++template <> + inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) { + __nv_bfloat162 s = bf162bf162(a); + bf16_4_t c; +@@ -201,7 +209,7 @@ inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) { + return c; + } + +-template<> ++template <> + inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) { + bf16_8_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); +@@ -211,7 +219,7 @@ inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) { + return c; + } + +-template<> ++template <> + inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) { + __nv_bfloat162 s = bf162bf162(a); + bf16_8_t c; +@@ -222,26 +230,26 @@ inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) { + return c; + } + +-template<> ++template <> + inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) { + float fa = __bfloat162float(a); + float fb = __bfloat162float(b); + return fa * fb; + } + +-template<> ++template <> + inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) { + float2 fa = bf1622float2(a); + float2 fb = bf1622float2(b); + return mul(fa, fb); + } + +-template<> ++template <> + inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) { + return mul(bf162bf162(a), b); + } + +-template<> ++template <> + inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) { + Float4_ fc; + fc.x = mul(a.x, b.x); +@@ -249,7 +257,7 @@ inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) { + return fc; + } + +-template<> ++template <> + inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) { + __nv_bfloat162 s = bf162bf162(a); + Float4_ fc; +@@ -258,7 +266,7 @@ inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) { + return fc; + } + +-template<> ++template <> + inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) { + Float8_ fc; + fc.x = mul(a.x, b.x); +@@ -268,7 +276,7 @@ inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) { + return fc; + } + +-template<> ++template <> + inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) { + __nv_bfloat162 s = bf162bf162(a); + Float8_ fc; +@@ -280,20 +288,24 @@ inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) { + } + + // Vector fused multiply-add. 
+-inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) { ++inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, ++ __nv_bfloat162 c) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); + #else + return __hfma2(a, b, c); + #endif ++ __builtin_unreachable(); // Suppress missing return statement warning + } + +-inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, __nv_bfloat162 c) { ++inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, ++ __nv_bfloat162 c) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); + #else + return __hfma2(bf162bf162(a), b, c); + #endif ++ __builtin_unreachable(); // Suppress missing return statement warning + } + + inline __device__ bf16_4_t fma(bf16_4_t a, bf16_4_t b, bf16_4_t c) { +@@ -379,23 +391,23 @@ inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) { + } + + // Vector sum. +-template<> ++template <> + inline __device__ float sum(__nv_bfloat16 v) { + return __bfloat162float(v); + } + +-template<> ++template <> + inline __device__ float sum(__nv_bfloat162 v) { + float2 vf = bf1622float2(v); + return vf.x + vf.y; + } + +-template<> ++template <> + inline __device__ float sum(bf16_4_t v) { + return sum(v.x) + sum(v.y); + } + +-template<> ++template <> + inline __device__ float sum(bf16_8_t v) { + return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w); + } +@@ -448,4 +460,4 @@ inline __device__ void zero(__nv_bfloat16& dst) { + #endif + } + +-} // namespace vllm ++} // namespace vllm +diff --git a/csrc/attention/dtype_float16.cuh b/csrc/attention/dtype_float16.cuh +index d3271e6..3a1815f 100644 +--- a/csrc/attention/dtype_float16.cuh ++++ b/csrc/attention/dtype_float16.cuh +@@ -1,6 +1,8 @@ + /* +- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp +- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h ++ * Adapted from ++ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp ++ * and ++ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * +@@ -30,37 +32,37 @@ + namespace vllm { + + // FP16 vector types for Q, K, V. +-template<> ++template <> + struct Vec { + using Type = uint16_t; + }; +-template<> ++template <> + struct Vec { + using Type = uint32_t; + }; +-template<> ++template <> + struct Vec { + using Type = uint2; + }; +-template<> ++template <> + struct Vec { + using Type = uint4; + }; + + // FP32 accumulator vector types corresponding to Vec. 
+-template<> ++template <> + struct FloatVec { + using Type = float; + }; +-template<> ++template <> + struct FloatVec { + using Type = float2; + }; +-template<> ++template <> + struct FloatVec { + using Type = Float4_; + }; +-template<> ++template <> + struct FloatVec { + using Type = Float8_; + }; +@@ -73,8 +75,8 @@ inline __device__ uint32_t h0_h0(uint16_t a) { + return b; + #else + union { +- uint32_t u32; +- uint16_t u16[2]; ++ uint32_t u32; ++ uint16_t u16[2]; + } tmp; + tmp.u16[0] = a; + tmp.u16[1] = a; +@@ -130,10 +132,12 @@ inline __device__ uint32_t float2_to_half2(float2 f) { + } tmp; + #ifndef USE_ROCM + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +- asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x)); ++ asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" ++ : "=r"(tmp.u32) ++ : "f"(f.y), "f"(f.x)); + #else +- asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x)); +- asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y)); ++ asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x)); ++ asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y)); + #endif + #else + tmp.u16[0] = float_to_half(f.x); +@@ -201,7 +205,7 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb) { + } + + // Vector multiplication. +-template<> ++template <> + inline __device__ uint16_t mul(uint16_t a, uint16_t b) { + uint16_t c; + #ifndef USE_ROCM +@@ -212,7 +216,7 @@ inline __device__ uint16_t mul(uint16_t a, uint16_t b) { + return c; + } + +-template<> ++template <> + inline __device__ uint32_t mul(uint32_t a, uint32_t b) { + uint32_t c; + #ifndef USE_ROCM +@@ -223,12 +227,12 @@ inline __device__ uint32_t mul(uint32_t a, uint32_t b) { + return c; + } + +-template<> ++template <> + inline __device__ uint32_t mul(uint16_t a, uint32_t b) { + return mul(h0_h0(a), b); + } + +-template<> ++template <> + inline __device__ uint2 mul(uint2 a, uint2 b) { + uint2 c; + c.x = mul(a.x, b.x); +@@ -236,7 +240,7 @@ inline __device__ uint2 mul(uint2 a, uint2 b) { + return c; + } + +-template<> ++template <> + inline __device__ uint2 mul(uint16_t a, uint2 b) { + uint32_t s = h0_h0(a); + uint2 c; +@@ -245,7 +249,7 @@ inline __device__ uint2 mul(uint16_t a, uint2 b) { + return c; + } + +-template<> ++template <> + inline __device__ uint4 mul(uint4 a, uint4 b) { + uint4 c; + c.x = mul(a.x, b.x); +@@ -255,7 +259,7 @@ inline __device__ uint4 mul(uint4 a, uint4 b) { + return c; + } + +-template<> ++template <> + inline __device__ uint4 mul(uint16_t a, uint4 b) { + uint32_t s = h0_h0(a); + uint4 c; +@@ -266,26 +270,26 @@ inline __device__ uint4 mul(uint16_t a, uint4 b) { + return c; + } + +-template<> ++template <> + inline __device__ float mul(uint16_t a, uint16_t b) { + float fa = half_to_float(a); + float fb = half_to_float(b); + return fa * fb; + } + +-template<> ++template <> + inline __device__ float2 mul(uint32_t a, uint32_t b) { + float2 fa = half2_to_float2(a); + float2 fb = half2_to_float2(b); + return mul(fa, fb); + } + +-template<> ++template <> + inline __device__ float2 mul(uint16_t a, uint32_t b) { + return mul(h0_h0(a), b); + } + +-template<> ++template <> + inline __device__ Float4_ mul(uint2 a, uint2 b) { + Float4_ fc; + fc.x = mul(a.x, b.x); +@@ -293,7 +297,7 @@ inline __device__ Float4_ mul(uint2 a, uint2 b) { + return fc; + } + +-template<> ++template <> + inline __device__ Float4_ mul(uint16_t a, uint2 b) { + uint32_t s = h0_h0(a); + Float4_ fc; +@@ -302,7 +306,7 @@ inline __device__ Float4_ mul(uint16_t a, uint2 b) { 
+ return fc; + } + +-template<> ++template <> + inline __device__ Float8_ mul(uint4 a, uint4 b) { + Float8_ fc; + fc.x = mul(a.x, b.x); +@@ -312,7 +316,7 @@ inline __device__ Float8_ mul(uint4 a, uint4 b) { + return fc; + } + +-template<> ++template <> + inline __device__ Float8_ mul(uint16_t a, uint4 b) { + uint32_t s = h0_h0(a); + Float8_ fc; +@@ -327,9 +331,13 @@ inline __device__ Float8_ mul(uint16_t a, uint4 b) { + inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) { + uint32_t d; + #ifndef USE_ROCM +- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c)); ++ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" ++ : "=r"(d) ++ : "r"(a), "r"(b), "r"(c)); + #else +- asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c)); ++ asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" ++ : "=v"(d) ++ : "v"(a), "v"(b), "v"(c)); + #endif + return d; + } +@@ -423,24 +431,24 @@ inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) { + } + + // Vector sum. +-template<> ++template <> + inline __device__ float sum(uint16_t v) { + return half_to_float(v); + } + +-template<> ++template <> + inline __device__ float sum(uint32_t v) { + float2 tmp = half2_to_float2(v); + return tmp.x + tmp.y; + } + +-template<> ++template <> + inline __device__ float sum(uint2 v) { + uint32_t c = add(v.x, v.y); + return sum(c); + } + +-template<> ++template <> + inline __device__ float sum(uint4 v) { + uint32_t c = add(v.x, v.y); + c = add(c, v.z); +@@ -470,13 +478,9 @@ inline __device__ void from_float(uint4& dst, Float8_ src) { + } + + // From float16 to float32. +-inline __device__ float to_float(uint16_t u) { +- return half_to_float(u); +-} ++inline __device__ float to_float(uint16_t u) { return half_to_float(u); } + +-inline __device__ float2 to_float(uint32_t u) { +- return half2_to_float2(u); +-} ++inline __device__ float2 to_float(uint32_t u) { return half2_to_float2(u); } + + inline __device__ Float4_ to_float(uint2 u) { + Float4_ tmp; +@@ -495,8 +499,6 @@ inline __device__ Float8_ to_float(uint4 u) { + } + + // Zero-out a variable. +-inline __device__ void zero(uint16_t& dst) { +- dst = uint16_t(0); +-} ++inline __device__ void zero(uint16_t& dst) { dst = uint16_t(0); } + +-} // namespace vllm ++} // namespace vllm +diff --git a/csrc/attention/dtype_float32.cuh b/csrc/attention/dtype_float32.cuh +index b200d2d..7c6a686 100644 +--- a/csrc/attention/dtype_float32.cuh ++++ b/csrc/attention/dtype_float32.cuh +@@ -1,6 +1,8 @@ + /* +- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp +- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h ++ * Adapted from ++ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp ++ * and ++ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * +@@ -38,37 +40,35 @@ struct Float8_ { + }; + + // FP32 vector types for Q, K, V. 
+-template<> ++template <> + struct Vec { + using Type = float; + }; +-template<> ++template <> + struct Vec { + using Type = float2; + }; +-template<> ++template <> + struct Vec { + using Type = float4; + }; + + // FP32 accumulator vector types corresponding to Vec. +-template<> ++template <> + struct FloatVec { + using Type = float; + }; +-template<> ++template <> + struct FloatVec { + using Type = float2; + }; +-template<> ++template <> + struct FloatVec { + using Type = float4; + }; + + // Vector addition. +-inline __device__ float add(float a, float b) { +- return a + b; +-} ++inline __device__ float add(float a, float b) { return a + b; } + + inline __device__ float2 add(float2 a, float2 b) { + float2 c; +@@ -87,12 +87,12 @@ inline __device__ float4 add(float4 a, float4 b) { + } + + // Vector multiplication. +-template<> ++template <> + inline __device__ float mul(float a, float b) { + return a * b; + } + +-template<> ++template <> + inline __device__ float2 mul(float2 a, float2 b) { + float2 c; + c.x = a.x * b.x; +@@ -100,7 +100,7 @@ inline __device__ float2 mul(float2 a, float2 b) { + return c; + } + +-template<> ++template <> + inline __device__ float2 mul(float a, float2 b) { + float2 c; + c.x = a * b.x; +@@ -108,7 +108,7 @@ inline __device__ float2 mul(float a, float2 b) { + return c; + } + +-template<> ++template <> + inline __device__ float4 mul(float4 a, float4 b) { + float4 c; + c.x = a.x * b.x; +@@ -118,7 +118,7 @@ inline __device__ float4 mul(float4 a, float4 b) { + return c; + } + +-template<> ++template <> + inline __device__ float4 mul(float a, float4 b) { + float4 c; + c.x = a * b.x; +@@ -129,9 +129,7 @@ inline __device__ float4 mul(float a, float4 b) { + } + + // Vector fused multiply-add. +-inline __device__ float fma(float a, float b, float c) { +- return a * b + c; +-} ++inline __device__ float fma(float a, float b, float c) { return a * b + c; } + + inline __device__ float2 fma(float2 a, float2 b, float2 c) { + float2 d; +@@ -182,35 +180,33 @@ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) { + } + + // Vector sum. +-template<> ++template <> + inline __device__ float sum(float v) { + return v; + } + +-template<> ++template <> + inline __device__ float sum(float2 v) { + return v.x + v.y; + } + +-template<> ++template <> + inline __device__ float sum(float4 v) { + return v.x + v.y + v.z + v.w; + } + +-template<> ++template <> + inline __device__ float sum(Float4_ v) { + return v.x.x + v.x.y + v.y.x + v.y.y; + } + +-template<> ++template <> + inline __device__ float sum(Float8_ v) { + return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y; + } + + // Vector dot product. +-inline __device__ float dot(float a, float b) { +- return a * b; +-} ++inline __device__ float dot(float a, float b) { return a * b; } + + inline __device__ float dot(float2 a, float2 b) { + float2 c = mul(a, b); +@@ -232,42 +228,24 @@ inline __device__ float dot(Float8_ a, Float8_ b) { + } + + // From float to float. +-inline __device__ void from_float(float& dst, float src) { +- dst = src; +-} ++inline __device__ void from_float(float& dst, float src) { dst = src; } + +-inline __device__ void from_float(float2& dst, float2 src) { +- dst = src; +-} ++inline __device__ void from_float(float2& dst, float2 src) { dst = src; } + +-inline __device__ void from_float(float4& dst, float4 src) { +- dst = src; +-} ++inline __device__ void from_float(float4& dst, float4 src) { dst = src; } + + // From float to float. 
+-inline __device__ float to_float(float u) { +- return u; +-} ++inline __device__ float to_float(float u) { return u; } + +-inline __device__ float2 to_float(float2 u) { +- return u; +-} ++inline __device__ float2 to_float(float2 u) { return u; } + +-inline __device__ float4 to_float(float4 u) { +- return u; +-} ++inline __device__ float4 to_float(float4 u) { return u; } + +-inline __device__ Float4_ to_float(Float4_ u) { +- return u; +-} ++inline __device__ Float4_ to_float(Float4_ u) { return u; } + +-inline __device__ Float8_ to_float(Float8_ u) { +- return u; +-} ++inline __device__ Float8_ to_float(Float8_ u) { return u; } + + // Zero-out a variable. +-inline __device__ void zero(float& dst) { +- dst = 0.f; +-} ++inline __device__ void zero(float& dst) { dst = 0.f; } + +-} // namespace vllm ++} // namespace vllm +diff --git a/csrc/attention/dtype_fp8.cuh b/csrc/attention/dtype_fp8.cuh +index d11dee9..e714e32 100644 +--- a/csrc/attention/dtype_fp8.cuh ++++ b/csrc/attention/dtype_fp8.cuh +@@ -3,33 +3,39 @@ + #include "attention_generic.cuh" + + #include +-#ifdef ENABLE_FP8_E5M2 +-#include +-#endif ++#ifdef ENABLE_FP8 ++ #ifndef USE_ROCM ++ #include ++ #endif // USE_ROCM ++#endif // ENABLE_FP8 + + namespace vllm { +-#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3) +-// fp8 vector types for quantization of kv cache + +-template<> ++enum class Fp8KVCacheDataType { ++ kAuto = 0, ++ kFp8E4M3 = 1, ++ kFp8E5M2 = 2, ++}; ++ ++// fp8 vector types for quantization of kv cache ++template <> + struct Vec { +- using Type = uint8_t; ++ using Type = uint8_t; + }; + +-template<> ++template <> + struct Vec { +- using Type = uint16_t; ++ using Type = uint16_t; + }; + +-template<> ++template <> + struct Vec { +- using Type = uint32_t; ++ using Type = uint32_t; + }; + +-template<> ++template <> + struct Vec { +- using Type = uint2; ++ using Type = uint2; + }; +-#endif // ENABLE_FP8_E5M2 + +-} // namespace vllm ++} // namespace vllm +diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu +new file mode 100644 +index 0000000..2732114 +--- /dev/null ++++ b/csrc/attention/paged_attention_v1.cu +@@ -0,0 +1,193 @@ ++/* ++ * Adapted from ++ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp ++ * Copyright (c) 2023, The vLLM team. ++ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "attention_kernels.cuh" ++ ++#ifndef USE_ROCM ++ #define WARP_SIZE 32 ++#else ++ #define WARP_SIZE warpSize ++#endif ++ ++#define MAX(a, b) ((a) > (b) ? (a) : (b)) ++#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) ++#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) ++ ++#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ ++ VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ ++ ((void*)vllm::paged_attention_v1_kernel), \ ++ shared_mem_size); \ ++ vllm::paged_attention_v1_kernel \ ++ <<>>( \ ++ out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ ++ scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ ++ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ ++ k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ ++ blocksparse_vert_stride, blocksparse_block_size, \ ++ blocksparse_head_sliding_step); ++ ++// TODO(woosuk): Tune NUM_THREADS. ++template ++void paged_attention_v1_launcher( ++ torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, ++ torch::Tensor& value_cache, int num_kv_heads, float scale, ++ torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, ++ const std::optional& alibi_slopes, float k_scale, ++ float v_scale, const int tp_rank, const int blocksparse_local_blocks, ++ const int blocksparse_vert_stride, const int blocksparse_block_size, ++ const int blocksparse_head_sliding_step) { ++ int num_seqs = query.size(0); ++ int num_heads = query.size(1); ++ int head_size = query.size(2); ++ int max_num_blocks_per_seq = block_tables.size(1); ++ int q_stride = query.stride(0); ++ int kv_block_stride = key_cache.stride(0); ++ int kv_head_stride = key_cache.stride(1); ++ ++ [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); ++ assert(head_size % thread_group_size == 0); ++ ++ // NOTE: alibi_slopes is optional. ++ const float* alibi_slopes_ptr = ++ alibi_slopes ++ ? reinterpret_cast(alibi_slopes.value().data_ptr()) ++ : nullptr; ++ ++ T* out_ptr = reinterpret_cast(out.data_ptr()); ++ T* query_ptr = reinterpret_cast(query.data_ptr()); ++ CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); ++ CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); ++ int* block_tables_ptr = block_tables.data_ptr(); ++ int* seq_lens_ptr = seq_lens.data_ptr(); ++ ++ constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; ++ int padded_max_seq_len = ++ DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE; ++ int logits_size = padded_max_seq_len * sizeof(float); ++ int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); ++ // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len ++ // Keep that in sync with the logic here! ++ int shared_mem_size = std::max(logits_size, outputs_size); ++ ++ dim3 grid(num_heads, num_seqs, 1); ++ dim3 block(NUM_THREADS); ++ const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); ++ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); ++ switch (head_size) { ++ // NOTE(woosuk): To reduce the compilation time, we only compile for the ++ // head sizes that we use in the model. However, we can easily extend this ++ // to support any head size which is a multiple of 16. 
++ case 32: ++ LAUNCH_PAGED_ATTENTION_V1(32); ++ break; ++ case 64: ++ LAUNCH_PAGED_ATTENTION_V1(64); ++ break; ++ case 80: ++ LAUNCH_PAGED_ATTENTION_V1(80); ++ break; ++ case 96: ++ LAUNCH_PAGED_ATTENTION_V1(96); ++ break; ++ case 112: ++ LAUNCH_PAGED_ATTENTION_V1(112); ++ break; ++ case 120: ++ LAUNCH_PAGED_ATTENTION_V1(120); ++ break; ++ case 128: ++ LAUNCH_PAGED_ATTENTION_V1(128); ++ break; ++ case 192: ++ LAUNCH_PAGED_ATTENTION_V1(192); ++ break; ++ case 256: ++ LAUNCH_PAGED_ATTENTION_V1(256); ++ break; ++ default: ++ TORCH_CHECK(false, "Unsupported head size: ", head_size); ++ break; ++ } ++} ++ ++#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ ++ paged_attention_v1_launcher( \ ++ out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ ++ seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \ ++ blocksparse_local_blocks, blocksparse_vert_stride, \ ++ blocksparse_block_size, blocksparse_head_sliding_step); ++ ++#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ ++ if (is_block_sparse) { \ ++ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ ++ } else { \ ++ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ ++ } ++ ++// NOTE(woosuk): To reduce the compilation time, we omitted block sizes ++// 1, 2, 4, 64, 128, 256. ++#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ ++ switch (block_size) { \ ++ case 8: \ ++ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ ++ break; \ ++ case 16: \ ++ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ ++ break; \ ++ case 32: \ ++ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ ++ break; \ ++ default: \ ++ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ ++ break; \ ++ } ++ ++void paged_attention_v1( ++ torch::Tensor& out, // [num_seqs, num_heads, head_size] ++ torch::Tensor& query, // [num_seqs, num_heads, head_size] ++ torch::Tensor& ++ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] ++ torch::Tensor& ++ value_cache, // [num_blocks, num_heads, head_size, block_size] ++ int64_t num_kv_heads, // [num_heads] ++ double scale, ++ torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] ++ torch::Tensor& seq_lens, // [num_seqs] ++ int64_t block_size, int64_t max_seq_len, ++ const std::optional& alibi_slopes, ++ const std::string& kv_cache_dtype, double k_scale, double v_scale, ++ const int64_t tp_rank, const int64_t blocksparse_local_blocks, ++ const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, ++ const int64_t blocksparse_head_sliding_step) { ++ const bool is_block_sparse = (blocksparse_vert_stride > 1); ++ ++ DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, ++ CALL_V1_LAUNCHER_BLOCK_SIZE) ++} ++ ++#undef WARP_SIZE ++#undef MAX ++#undef MIN ++#undef DIVIDE_ROUND_UP +\ No newline at end of file +diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu +new file mode 100644 +index 0000000..a453b22 +--- /dev/null ++++ b/csrc/attention/paged_attention_v2.cu +@@ -0,0 +1,203 @@ ++/* ++ * Adapted from ++ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp ++ * Copyright (c) 2023, The vLLM team. ++ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 
++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "attention_kernels.cuh" ++ ++#ifndef USE_ROCM ++ #define WARP_SIZE 32 ++#else ++ #define WARP_SIZE warpSize ++#endif ++ ++#define MAX(a, b) ((a) > (b) ? (a) : (b)) ++#define MIN(a, b) ((a) < (b) ? (a) : (b)) ++#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) ++ ++#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ ++ vllm::paged_attention_v2_kernel \ ++ <<>>( \ ++ exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ ++ value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ ++ seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ ++ kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ ++ blocksparse_local_blocks, blocksparse_vert_stride, \ ++ blocksparse_block_size, blocksparse_head_sliding_step); \ ++ vllm::paged_attention_v2_reduce_kernel \ ++ <<>>( \ ++ out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ ++ max_num_partitions); ++ ++template ++void paged_attention_v2_launcher( ++ torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, ++ torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, ++ torch::Tensor& value_cache, int num_kv_heads, float scale, ++ torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, ++ const std::optional& alibi_slopes, float k_scale, ++ float v_scale, const int tp_rank, const int blocksparse_local_blocks, ++ const int blocksparse_vert_stride, const int blocksparse_block_size, ++ const int blocksparse_head_sliding_step) { ++ int num_seqs = query.size(0); ++ int num_heads = query.size(1); ++ int head_size = query.size(2); ++ int max_num_blocks_per_seq = block_tables.size(1); ++ int q_stride = query.stride(0); ++ int kv_block_stride = key_cache.stride(0); ++ int kv_head_stride = key_cache.stride(1); ++ ++ [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); ++ assert(head_size % thread_group_size == 0); ++ ++ // NOTE: alibi_slopes is optional. ++ const float* alibi_slopes_ptr = ++ alibi_slopes ++ ? reinterpret_cast(alibi_slopes.value().data_ptr()) ++ : nullptr; ++ ++ T* out_ptr = reinterpret_cast(out.data_ptr()); ++ float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); ++ float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); ++ T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); ++ T* query_ptr = reinterpret_cast(query.data_ptr()); ++ CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); ++ CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); ++ int* block_tables_ptr = block_tables.data_ptr(); ++ int* seq_lens_ptr = seq_lens.data_ptr(); ++ ++ constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; ++ int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); ++ int logits_size = PARTITION_SIZE * sizeof(float); ++ int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); ++ ++ // For paged attention v2 kernel. 
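++  // The main V2 kernel launches one block per (head, seq, partition) and needs
++  // shared memory for either the PARTITION_SIZE softmax logits or the
++  // cross-warp output reduction, whichever is larger. The reduce kernel only
++  // needs two floats per partition (a max logit and an exp sum).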
++ dim3 grid(num_heads, num_seqs, max_num_partitions); ++ int shared_mem_size = std::max(logits_size, outputs_size); ++ // For paged attention v2 reduce kernel. ++ dim3 reduce_grid(num_heads, num_seqs); ++ int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); ++ ++ dim3 block(NUM_THREADS); ++ const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); ++ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); ++ switch (head_size) { ++ // NOTE(woosuk): To reduce the compilation time, we only compile for the ++ // head sizes that we use in the model. However, we can easily extend this ++ // to support any head size which is a multiple of 16. ++ case 32: ++ LAUNCH_PAGED_ATTENTION_V2(32); ++ break; ++ case 64: ++ LAUNCH_PAGED_ATTENTION_V2(64); ++ break; ++ case 80: ++ LAUNCH_PAGED_ATTENTION_V2(80); ++ break; ++ case 96: ++ LAUNCH_PAGED_ATTENTION_V2(96); ++ break; ++ case 112: ++ LAUNCH_PAGED_ATTENTION_V2(112); ++ break; ++ case 120: ++ LAUNCH_PAGED_ATTENTION_V2(120); ++ break; ++ case 128: ++ LAUNCH_PAGED_ATTENTION_V2(128); ++ break; ++ case 192: ++ LAUNCH_PAGED_ATTENTION_V2(192); ++ break; ++ case 256: ++ LAUNCH_PAGED_ATTENTION_V2(256); ++ break; ++ default: ++ TORCH_CHECK(false, "Unsupported head size: ", head_size); ++ break; ++ } ++} ++ ++#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ ++ paged_attention_v2_launcher( \ ++ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ ++ num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ ++ k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ ++ blocksparse_vert_stride, blocksparse_block_size, \ ++ blocksparse_head_sliding_step); ++ ++#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ ++ if (is_block_sparse) { \ ++ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ ++ } else { \ ++ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ ++ } ++ ++// NOTE(woosuk): To reduce the compilation time, we omitted block sizes ++// 1, 2, 4, 64, 128, 256. 
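++// Dispatch chain: DISPATCH_BY_KV_CACHE_DTYPE selects T/CACHE_T/KV_DTYPE from
++// the runtime dtypes, this macro selects BLOCK_SIZE (8/16/32),
++// CALL_V2_LAUNCHER_SPARSITY selects IS_BLOCK_SPARSE, and the launcher's switch
++// finally selects HEAD_SIZE.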
++#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ ++ switch (block_size) { \ ++ case 8: \ ++ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ ++ break; \ ++ case 16: \ ++ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ ++ break; \ ++ case 32: \ ++ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ ++ break; \ ++ default: \ ++ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ ++ break; \ ++ } ++ ++void paged_attention_v2( ++ torch::Tensor& out, // [num_seqs, num_heads, head_size] ++ torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] ++ torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] ++ torch::Tensor& ++ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] ++ torch::Tensor& query, // [num_seqs, num_heads, head_size] ++ torch::Tensor& ++ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] ++ torch::Tensor& ++ value_cache, // [num_blocks, num_heads, head_size, block_size] ++ int64_t num_kv_heads, // [num_heads] ++ double scale, ++ torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] ++ torch::Tensor& seq_lens, // [num_seqs] ++ int64_t block_size, int64_t max_seq_len, ++ const std::optional& alibi_slopes, ++ const std::string& kv_cache_dtype, double k_scale, double v_scale, ++ const int64_t tp_rank, const int64_t blocksparse_local_blocks, ++ const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, ++ const int64_t blocksparse_head_sliding_step) { ++ const bool is_block_sparse = (blocksparse_vert_stride > 1); ++ DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, ++ CALL_V2_LAUNCHER_BLOCK_SIZE) ++} ++ ++#undef WARP_SIZE ++#undef MAX ++#undef MIN ++#undef DIVIDE_ROUND_UP +\ No newline at end of file +diff --git a/csrc/cache.h b/csrc/cache.h +index 4c142ce..11c4c50 100644 +--- a/csrc/cache.h ++++ b/csrc/cache.h +@@ -1,38 +1,33 @@ + #pragma once + +-#include ++#include + + #include + #include + +-void swap_blocks( +- torch::Tensor& src, +- torch::Tensor& dst, +- const std::map& block_mapping); ++void swap_blocks(torch::Tensor& src, torch::Tensor& dst, ++ const torch::Tensor& block_mapping); + +-void copy_blocks( +- std::vector& key_caches, +- std::vector& value_caches, +- const std::map>& block_mapping); ++// Note: the key_caches and value_caches vectors are constant but ++// not the Tensors they contain. The vectors need to be const refs ++// in order to satisfy pytorch's C++ operator registration code. 
++void copy_blocks(std::vector const& key_caches, ++ std::vector const& value_caches, ++ const torch::Tensor& block_mapping); + +-void reshape_and_cache( +- torch::Tensor& key, +- torch::Tensor& value, +- torch::Tensor& key_cache, +- torch::Tensor& value_cache, +- torch::Tensor& slot_mapping, +- const std::string& kv_cache_dtype, +- const float kv_scale); ++void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, ++ torch::Tensor& key_cache, torch::Tensor& value_cache, ++ torch::Tensor& slot_mapping, ++ const std::string& kv_cache_dtype, const double k_scale, ++ const double v_scale); + +-void reshape_and_cache_flash( +- torch::Tensor& key, +- torch::Tensor& value, +- torch::Tensor& key_cache, +- torch::Tensor& value_cache, +- torch::Tensor& slot_mapping, +- const std::string& kv_cache_dtype); ++void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, ++ torch::Tensor& key_cache, ++ torch::Tensor& value_cache, ++ torch::Tensor& slot_mapping, ++ const std::string& kv_cache_dtype, ++ const double k_scale, const double v_scale); + + // Just for unittest +-void convert_fp8( +- torch::Tensor& src_cache, +- torch::Tensor& dst_cache); ++void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, ++ const double scale, const std::string& kv_cache_dtype); +diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu +index 42f884c..8a95279 100644 +--- a/csrc/cache_kernels.cu ++++ b/csrc/cache_kernels.cu +@@ -1,13 +1,14 @@ +-#include ++#include + #include + #include + + #include "cuda_compat.h" + #include "dispatch_utils.h" +-#if defined(ENABLE_FP8_E5M2) +-#include "quantization/fp8_e5m2_kvcache/quant_utils.cuh" +-#elif defined(ENABLE_FP8_E4M3) +-#include "quantization/fp8/amd_detail/quant_utils.cuh" ++ ++#ifdef USE_ROCM ++ #include "quantization/fp8/amd/quant_utils.cuh" ++#else ++ #include "quantization/fp8/nvidia/quant_utils.cuh" + #endif + + #include +@@ -17,20 +18,17 @@ + + #ifdef USE_ROCM + #include +- typedef __hip_bfloat16 __nv_bfloat16; ++typedef __hip_bfloat16 __nv_bfloat16; + #endif + +-void swap_blocks( +- torch::Tensor& src, +- torch::Tensor& dst, +- const std::map& block_mapping) { ++void swap_blocks(torch::Tensor& src, torch::Tensor& dst, ++ const torch::Tensor& block_mapping) { + torch::Device src_device = src.device(); + torch::Device dst_device = dst.device(); + cudaMemcpyKind memcpy_type; + if (src_device.is_cuda() && dst_device.is_cuda()) { +- TORCH_CHECK( +- src_device.index() == dst_device.index(), +- "src and dst must be on the same GPU"); ++ TORCH_CHECK(src_device.index() == dst_device.index(), ++ "src and dst must be on the same GPU"); + memcpy_type = cudaMemcpyDeviceToDevice; + } else if (src_device.is_cuda() && dst_device.is_cpu()) { + memcpy_type = cudaMemcpyDeviceToHost; +@@ -40,41 +38,44 @@ void swap_blocks( + TORCH_CHECK(false, "Invalid device combination"); + } + +- char *src_ptr = static_cast(src.data_ptr()); +- char *dst_ptr = static_cast(dst.data_ptr()); ++ // NOTE(youkaichao): keep in mind that `block_mapping` should be ++ // a cpu tensor, otherwise every `item` call will require a gpu-cpu ++ // synchronization. ++ TORCH_CHECK(block_mapping.device().is_cpu(), "block_mapping must be on CPU"); ++ ++ char* src_ptr = static_cast(src.data_ptr()); ++ char* dst_ptr = static_cast(dst.data_ptr()); + + const int64_t block_size_in_bytes = src.element_size() * src[0].numel(); +- const at::cuda::OptionalCUDAGuard device_guard(src_device.is_cuda() ? src_device : dst_device); ++ const at::cuda::OptionalCUDAGuard device_guard( ++ src_device.is_cuda() ? 
src_device : dst_device); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + // NOTE(woosuk): This can be slow if the number of blocks is large. +- for (const auto& pair : block_mapping) { +- int64_t src_block_number = pair.first; +- int64_t dst_block_number = pair.second; ++ const int64_t num_blocks = block_mapping.size(0); ++ for (size_t i = 0; i < num_blocks; i++) { ++ int64_t src_block_number = block_mapping[i][0].item(); ++ int64_t dst_block_number = block_mapping[i][1].item(); + int64_t src_offset = src_block_number * block_size_in_bytes; + int64_t dst_offset = dst_block_number * block_size_in_bytes; +- cudaMemcpyAsync( +- dst_ptr + dst_offset, +- src_ptr + src_offset, +- block_size_in_bytes, +- memcpy_type, +- stream); ++ cudaMemcpyAsync(dst_ptr + dst_offset, src_ptr + src_offset, ++ block_size_in_bytes, memcpy_type, stream); + } + } + + namespace vllm { + + // Grid: (num_layers, num_pairs) +-template +-__global__ void copy_blocks_kernel( +- int64_t* key_cache_ptrs, +- int64_t* value_cache_ptrs, +- const int64_t* __restrict__ block_mapping, +- const int numel_per_block) { ++template ++__global__ void copy_blocks_kernel(int64_t* key_cache_ptrs, ++ int64_t* value_cache_ptrs, ++ const int64_t* __restrict__ block_mapping, ++ const int numel_per_block) { + const int layer_idx = blockIdx.x; + const int pair_idx = blockIdx.y; + + scalar_t* key_cache = reinterpret_cast(key_cache_ptrs[layer_idx]); +- scalar_t* value_cache = reinterpret_cast(value_cache_ptrs[layer_idx]); ++ scalar_t* value_cache = ++ reinterpret_cast(value_cache_ptrs[layer_idx]); + int64_t src_block_number = block_mapping[2 * pair_idx]; + int64_t dst_block_number = block_mapping[2 * pair_idx + 1]; + +@@ -92,12 +93,14 @@ __global__ void copy_blocks_kernel( + } + } + +-} // namespace vllm ++} // namespace vllm + +-void copy_blocks( +- std::vector& key_caches, +- std::vector& value_caches, +- const std::map>& block_mapping) { ++// Note: the key_caches and value_caches vectors are constant but ++// not the Tensors they contain. The vectors need to be const refs ++// in order to satisfy pytorch's C++ operator registration code. ++void copy_blocks(std::vector const& key_caches, ++ std::vector const& value_caches, ++ const torch::Tensor& block_mapping) { + int num_layers = key_caches.size(); + TORCH_CHECK(num_layers == value_caches.size()); + if (num_layers == 0) { +@@ -111,29 +114,23 @@ void copy_blocks( + int64_t key_cache_ptrs[num_layers]; + int64_t value_cache_ptrs[num_layers]; + for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { +- key_cache_ptrs[layer_idx] = reinterpret_cast(key_caches[layer_idx].data_ptr()); +- value_cache_ptrs[layer_idx] = reinterpret_cast(value_caches[layer_idx].data_ptr()); ++ key_cache_ptrs[layer_idx] = ++ reinterpret_cast(key_caches[layer_idx].data_ptr()); ++ value_cache_ptrs[layer_idx] = ++ reinterpret_cast(value_caches[layer_idx].data_ptr()); + } +- // Create block mapping array. +- std::vector block_mapping_vec; +- for (const auto& pair : block_mapping) { +- int64_t src_block_number = pair.first; +- for (int64_t dst_block_number : pair.second) { +- block_mapping_vec.push_back(src_block_number); +- block_mapping_vec.push_back(dst_block_number); +- } +- } +- int64_t* block_mapping_array = block_mapping_vec.data(); +- int num_pairs = block_mapping_vec.size() / 2; ++ ++ // block_mapping is a 2D tensor with shape (num_pairs, 2). ++ int num_pairs = block_mapping.size(0); + + // Move the data structures to the GPU. + // NOTE: This synchronizes the CPU and GPU. 
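The rewritten swap_blocks above reads block_mapping[i][0] / block_mapping[i][1] on the host, so callers are expected to pass a CPU int64 tensor of shape [num_pairs, 2] (source block in column 0, destination in column 1). A minimal caller-side sketch of building that tensor with libtorch; make_block_mapping is a hypothetical helper, not part of this patch:

#include <cstdint>
#include <torch/torch.h>
#include <utility>
#include <vector>

torch::Tensor make_block_mapping(
    const std::vector<std::pair<int64_t, int64_t>>& pairs) {
  // Keep the tensor on the CPU so swap_blocks' per-pair .item() calls do not
  // force a GPU synchronization (see the NOTE in the hunk above).
  auto mapping = torch::empty(
      {static_cast<int64_t>(pairs.size()), 2},
      torch::dtype(torch::kInt64).device(torch::kCPU));
  auto acc = mapping.accessor<int64_t, 2>();
  for (int64_t i = 0; i < mapping.size(0); ++i) {
    acc[i][0] = pairs[i].first;   // source block number
    acc[i][1] = pairs[i].second;  // destination block number
  }
  return mapping;
}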
+- torch::Tensor key_cache_ptrs_tensor = torch::from_blob( +- key_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device); +- torch::Tensor value_cache_ptrs_tensor = torch::from_blob( +- value_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device); +- torch::Tensor block_mapping_tensor = torch::from_blob( +- block_mapping_array, {2 * num_pairs}, torch::kInt64).to(cache_device); ++ torch::Tensor key_cache_ptrs_tensor = ++ torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64) ++ .to(cache_device); ++ torch::Tensor value_cache_ptrs_tensor = ++ torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64) ++ .to(cache_device); + + // Launch the kernel. + const int numel_per_block = key_caches[0][0].numel(); +@@ -142,31 +139,28 @@ void copy_blocks( + const at::cuda::OptionalCUDAGuard device_guard(cache_device); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( +- key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] { +- vllm::copy_blocks_kernel<<>>( +- key_cache_ptrs_tensor.data_ptr(), +- value_cache_ptrs_tensor.data_ptr(), +- block_mapping_tensor.data_ptr(), +- numel_per_block); +- })); ++ key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] { ++ vllm::copy_blocks_kernel<<>>( ++ key_cache_ptrs_tensor.data_ptr(), ++ value_cache_ptrs_tensor.data_ptr(), ++ block_mapping.data_ptr(), numel_per_block); ++ })); + } + + namespace vllm { + +-template ++template + __global__ void reshape_and_cache_kernel( +- const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] +- const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] +- cache_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] +- cache_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] +- const int64_t* __restrict__ slot_mapping, // [num_tokens] +- const int key_stride, +- const int value_stride, +- const int num_heads, +- const int head_size, +- const int block_size, +- const int x, +- const float kv_scale) { ++ const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] ++ const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] ++ cache_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, ++ // block_size, x] ++ cache_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, ++ // block_size] ++ const int64_t* __restrict__ slot_mapping, // [num_tokens] ++ const int key_stride, const int value_stride, const int num_heads, ++ const int head_size, const int block_size, const int x, const float k_scale, ++ const float v_scale) { + const int64_t token_idx = blockIdx.x; + const int64_t slot_idx = slot_mapping[token_idx]; + if (slot_idx < 0) { +@@ -187,47 +181,40 @@ __global__ void reshape_and_cache_kernel( + const int x_idx = head_offset / x; + const int x_offset = head_offset % x; + +- const int64_t tgt_key_idx = block_idx * num_heads * (head_size / x) * block_size * x +- + head_idx * (head_size / x) * block_size * x +- + x_idx * block_size * x +- + block_offset * x +- + x_offset; +- const int64_t tgt_value_idx = block_idx * num_heads * head_size * block_size +- + head_idx * head_size * block_size +- + head_offset * block_size +- + block_offset; ++ const int64_t tgt_key_idx = ++ block_idx * num_heads * (head_size / x) * block_size * x + ++ head_idx * (head_size / x) * block_size * x + x_idx * block_size * x + ++ block_offset * x + x_offset; ++ const int64_t tgt_value_idx = ++ block_idx * num_heads * head_size * block_size + ++ head_idx 
* head_size * block_size + head_offset * block_size + ++ block_offset; + scalar_t tgt_key = key[src_key_idx]; + scalar_t tgt_value = value[src_value_idx]; +- if constexpr (is_fp8_kv_cache) { +-#if defined(ENABLE_FP8_E5M2) +- key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_key); +- value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_value); +-#elif defined(ENABLE_FP8_E4M3) +- key_cache[tgt_key_idx] = fp8_e4m3::scaled_vec_conversion(tgt_key, kv_scale); +- value_cache[tgt_value_idx] = fp8_e4m3::scaled_vec_conversion(tgt_value, kv_scale); +-#else +- assert(false); +-#endif +- } else { ++ if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + key_cache[tgt_key_idx] = tgt_key; + value_cache[tgt_value_idx] = tgt_value; ++ } else { ++ key_cache[tgt_key_idx] = ++ fp8::scaled_convert(tgt_key, k_scale); ++ value_cache[tgt_value_idx] = ++ fp8::scaled_convert(tgt_value, v_scale); + } + } + } + +-template ++template + __global__ void reshape_and_cache_flash_kernel( +- const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] +- const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] +- scalar_t* __restrict__ k_cache, // [num_blocks, block_size, num_heads, head_size] +- scalar_t* __restrict__ v_cache, // [num_blocks, block_size, num_heads, head_size] +- const int64_t* __restrict__ slot_mapping, // [num_tokens] +- const int block_stride, +- const int key_stride, +- const int value_stride, +- const int num_heads, +- const int head_size, +- const int block_size) { ++ const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] ++ const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] ++ cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads, ++ // head_size] ++ cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads, ++ // head_size] ++ const int64_t* __restrict__ slot_mapping, // [num_tokens] ++ const int block_stride, const int key_stride, const int value_stride, ++ const int num_heads, const int head_size, const int block_size, ++ const float k_scale, const float v_scale) { + const int64_t token_idx = blockIdx.x; + const int64_t slot_idx = slot_mapping[token_idx]; + // NOTE: slot_idx can be -1 if the token is padded +@@ -242,40 +229,47 @@ __global__ void reshape_and_cache_flash_kernel( + const int64_t src_value_idx = token_idx * value_stride + i; + const int head_idx = i / head_size; + const int head_offset = i % head_size; +- const int64_t tgt_value_idx = block_idx * block_stride +- + block_offset * num_heads * head_size +- + head_idx * head_size +- + head_offset; +- k_cache[tgt_value_idx] = key[src_key_idx]; +- v_cache[tgt_value_idx] = value[src_value_idx]; ++ const int64_t tgt_key_value_idx = block_idx * block_stride + ++ block_offset * num_heads * head_size + ++ head_idx * head_size + head_offset; ++ scalar_t tgt_key = key[src_key_idx]; ++ scalar_t tgt_value = value[src_value_idx]; ++ if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { ++ key_cache[tgt_key_value_idx] = tgt_key; ++ value_cache[tgt_key_value_idx] = tgt_value; ++ } else { ++ key_cache[tgt_key_value_idx] = ++ fp8::scaled_convert(tgt_key, k_scale); ++ value_cache[tgt_key_value_idx] = ++ fp8::scaled_convert(tgt_value, v_scale); ++ } + } + } +-} // namespace vllm +- +-#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_KV_CACHE) \ +- vllm::reshape_and_cache_kernel<<>>( \ +- reinterpret_cast(key.data_ptr()), \ +- reinterpret_cast(value.data_ptr()), \ +- reinterpret_cast(key_cache.data_ptr()), \ +- 
reinterpret_cast(value_cache.data_ptr()), \ +- slot_mapping.data_ptr(), \ +- key_stride, \ +- value_stride, \ +- num_heads, \ +- head_size, \ +- block_size, \ +- x, \ +- kv_scale); ++} // namespace vllm ++ ++// KV_T is the stored data type of kv-cache. ++// CACHE_T is the data type of key and value tensors. ++// KV_DTYPE is the real data type of kv-cache. ++#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \ ++ vllm::reshape_and_cache_kernel \ ++ <<>>( \ ++ reinterpret_cast(key.data_ptr()), \ ++ reinterpret_cast(value.data_ptr()), \ ++ reinterpret_cast(key_cache.data_ptr()), \ ++ reinterpret_cast(value_cache.data_ptr()), \ ++ slot_mapping.data_ptr(), key_stride, value_stride, \ ++ num_heads, head_size, block_size, x, k_scale, v_scale); + + void reshape_and_cache( +- torch::Tensor& key, // [num_tokens, num_heads, head_size] +- torch::Tensor& value, // [num_tokens, num_heads, head_size] +- torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] +- torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] +- torch::Tensor& slot_mapping, // [num_tokens] +- const std::string& kv_cache_dtype, +- const float kv_scale) +-{ ++ torch::Tensor& key, // [num_tokens, num_heads, head_size] ++ torch::Tensor& value, // [num_tokens, num_heads, head_size] ++ torch::Tensor& ++ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] ++ torch::Tensor& ++ value_cache, // [num_blocks, num_heads, head_size, block_size] ++ torch::Tensor& slot_mapping, // [num_tokens] ++ const std::string& kv_cache_dtype, const double k_scale, ++ const double v_scale) { + int num_tokens = key.size(0); + int num_heads = key.size(1); + int head_size = key.size(2); +@@ -289,111 +283,93 @@ void reshape_and_cache( + dim3 block(std::min(num_heads * head_size, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); +- if (kv_cache_dtype == "auto") { +- if (key.dtype() == at::ScalarType::Float) { +- CALL_RESHAPE_AND_CACHE(float, float, false); +- } else if (key.dtype() == at::ScalarType::Half) { +- CALL_RESHAPE_AND_CACHE(uint16_t, uint16_t, false); +- } else if (key.dtype() == at::ScalarType::BFloat16) { +- CALL_RESHAPE_AND_CACHE(__nv_bfloat16, __nv_bfloat16, false); +- } +- } else if (kv_cache_dtype == "fp8") { +- if (key.dtype() == at::ScalarType::Float) { +- CALL_RESHAPE_AND_CACHE(float, uint8_t, true); +- } else if (key.dtype() == at::ScalarType::Half) { +- CALL_RESHAPE_AND_CACHE(uint16_t, uint8_t, true); +- } else if (key.dtype() == at::ScalarType::BFloat16) { +- CALL_RESHAPE_AND_CACHE(__nv_bfloat16, uint8_t, true); +- } +- } else { +- TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype); +- } ++ ++ DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype, ++ CALL_RESHAPE_AND_CACHE) + } + ++// KV_T is the stored data type of kv-cache. ++// CACHE_T is the data type of key and value tensors. ++// KV_DTYPE is the real data type of kv-cache. 
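For readers unfamiliar with scaled fp8 KV caches: the new k_scale / v_scale arguments threaded through reshape_and_cache above follow the usual quantize-on-store / dequantize-on-load scheme (this is the common convention; the exact rounding lives in fp8::scaled_convert and the platform quant_utils headers). A float-only sketch of where the scale is applied; narrow_to_fp8_stub, store_kv_stub and load_kv_stub are illustrative names, not vLLM functions:

#include <algorithm>

// Crude stand-in for fp8 e4m3 narrowing: clamp to the format's max magnitude
// (about +/-448 for e4m3fn); real code would also round the mantissa.
inline float narrow_to_fp8_stub(float x) {
  return std::clamp(x, -448.0f, 448.0f);
}

// Store path (reshape_and_cache / reshape_and_cache_flash): scale down first.
inline float store_kv_stub(float value, float scale) {
  return narrow_to_fp8_stub(value / scale);
}

// Load path (attention kernels): scale the cached value back up.
inline float load_kv_stub(float cached, float scale) {
  return cached * scale;
}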
++#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \ ++ vllm::reshape_and_cache_flash_kernel \ ++ <<>>( \ ++ reinterpret_cast(key.data_ptr()), \ ++ reinterpret_cast(value.data_ptr()), \ ++ reinterpret_cast(key_cache.data_ptr()), \ ++ reinterpret_cast(value_cache.data_ptr()), \ ++ slot_mapping.data_ptr(), block_stride, key_stride, \ ++ value_stride, num_heads, head_size, block_size, k_scale, v_scale); ++ + void reshape_and_cache_flash( +- torch::Tensor& key, // [num_tokens, num_heads, head_size] +- torch::Tensor& value, // [num_tokens, num_heads, head_size] +- torch::Tensor& k_cache, // [num_blocks, block_size, num_heads, head_size] +- torch::Tensor& v_cache, // [num_blocks, block_size, num_heads, head_size] +- torch::Tensor& slot_mapping, // [num_tokens] +- const std::string& kv_cache_dtype) +-{ +- // FIXME: only support auto datatype, does not support fp8 +- if (kv_cache_dtype != "auto") { +- TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype); +- } +- int num_tokens = key.size(0); ++ torch::Tensor& key, // [num_tokens, num_heads, head_size] ++ torch::Tensor& value, // [num_tokens, num_heads, head_size] ++ torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size] ++ torch::Tensor& ++ value_cache, // [num_blocks, block_size, num_heads, head_size] ++ torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] ++ const std::string& kv_cache_dtype, const double k_scale, ++ const double v_scale) { ++ // NOTE(woosuk): In vLLM V1, key.size(0) can be different from ++ // slot_mapping.size(0) because of padding for CUDA graphs. ++ // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because ++ // both include padding. ++ // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0) ++ // since key includes padding for CUDA graphs, while slot_mapping does not. ++ // In this case, slot_mapping.size(0) represents the actual number of tokens ++ // before padding. ++ // For compatibility with both cases, we use slot_mapping.size(0) as the ++ // number of tokens. 
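A hypothetical caller-side restatement of the NOTE above (check_flash_cache_args is not part of the patch): only the first slot_mapping.size(0) rows of key/value are written to the cache, and any trailing rows are treated as CUDA-graph padding.

#include <torch/torch.h>

void check_flash_cache_args(const torch::Tensor& key,
                            const torch::Tensor& value,
                            const torch::Tensor& slot_mapping) {
  const int64_t num_actual_tokens = slot_mapping.size(0);
  TORCH_CHECK(key.size(0) >= num_actual_tokens &&
                  value.size(0) >= num_actual_tokens,
              "key/value must cover at least the un-padded tokens");
  // Rows [num_actual_tokens, key.size(0)) are padding and are never cached.
}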
++ int num_tokens = slot_mapping.size(0); + int num_heads = key.size(1); + int head_size = key.size(2); +- int block_size = k_cache.size(1); ++ int block_size = key_cache.size(1); + + int key_stride = key.stride(0); + int value_stride = value.stride(0); +- int block_stride = k_cache.stride(0); +- TORCH_CHECK(k_cache.stride(0) == v_cache.stride(0)); ++ int block_stride = key_cache.stride(0); ++ TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0)); + + dim3 grid(num_tokens); + dim3 block(std::min(num_heads * head_size, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); +- VLLM_DISPATCH_FLOATING_TYPES( +- key.scalar_type(), +- "reshape_and_cache_flash", +- [&] { +- vllm::reshape_and_cache_flash_kernel<<>>( +- key.data_ptr(), +- value.data_ptr(), +- k_cache.data_ptr(), +- v_cache.data_ptr(), +- slot_mapping.data_ptr(), +- block_stride, +- key_stride, +- value_stride, +- num_heads, +- head_size, +- block_size); +- }); ++ ++ DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype, ++ CALL_RESHAPE_AND_CACHE_FLASH); + } + + namespace vllm { + +-template +-__global__ void convert_fp8_kernel( +- const Tin* __restrict__ src_cache, +- Tout* __restrict__ dst_cache, +- const int64_t block_stride) { ++template ++__global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache, ++ Tout* __restrict__ dst_cache, ++ const float scale, ++ const int64_t block_stride) { + const int64_t block_idx = blockIdx.x; + for (int i = threadIdx.x; i < block_stride; i += blockDim.x) { + int64_t idx = block_idx * block_stride + i; +-#if defined(ENABLE_FP8_E5M2) +- dst_cache[idx] = fp8_e5m2_unscaled::vec_conversion(src_cache[idx]); +-#elif defined(ENABLE_FP8_E4M3) +- dst_cache[idx] = fp8_e4m3::vec_conversion(src_cache[idx]); +-#else +- assert(false); +-#endif ++ dst_cache[idx] = ++ fp8::scaled_convert(src_cache[idx], scale); + } + } + +-} // namespace vllm ++} // namespace vllm + +-#define CALL_CONVERT_FP8(Tout, Tin) \ +- vllm::convert_fp8_kernel<<>>( \ +- reinterpret_cast(src_cache.data_ptr()), \ +- reinterpret_cast(dst_cache.data_ptr()), \ +- block_stride); ++#define CALL_CONVERT_FP8(Tout, Tin, KV_DTYPE) \ ++ vllm::convert_fp8_kernel<<>>( \ ++ reinterpret_cast(src_cache.data_ptr()), \ ++ reinterpret_cast(dst_cache.data_ptr()), scale, block_stride); + +-void convert_fp8( +- torch::Tensor& src_cache, +- torch::Tensor& dst_cache) +-{ ++// Only for testing. 
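Before the definition that follows, a usage sketch of the reworked convert_fp8 entry point (signature as declared in csrc/cache.h earlier in this patch). The round trip below is hypothetical test code, not part of the patch; note that the destination tensor now comes first and that the scale plus kv_cache_dtype select the conversion.

#include <string>
#include <torch/torch.h>

// Mirrors the declaration from csrc/cache.h above.
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                 const double scale, const std::string& kv_cache_dtype);

void fp8_round_trip_example() {
  auto fp16 = torch::dtype(torch::kFloat16).device(torch::kCUDA);
  torch::Tensor src = torch::randn({8, 16, 16, 16}, fp16);  // fp16 cache
  torch::Tensor quant = torch::empty(src.sizes(), fp16.dtype(torch::kUInt8));
  torch::Tensor back = torch::empty_like(src);
  const double scale = 1.0;
  convert_fp8(quant, src, scale, "fp8");   // fp16 -> fp8 (stored as uint8)
  convert_fp8(back, quant, scale, "fp8");  // fp8 -> fp16
}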
++void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, ++ const double scale, const std::string& kv_cache_dtype) { + torch::Device src_device = src_cache.device(); + torch::Device dst_device = dst_cache.device(); + TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU") + TORCH_CHECK(dst_device.is_cuda(), "dst must be on a GPU") +- TORCH_CHECK( +- src_device.index() == dst_device.index(), +- "src and dst must be on the same GPU"); ++ TORCH_CHECK(src_device.index() == dst_device.index(), ++ "src and dst must be on the same GPU"); + at::cuda::OptionalCUDAGuard device_guard(src_device); + + int64_t num_blocks = src_cache.size(0); +@@ -403,17 +379,37 @@ void convert_fp8( + dim3 block(std::min(block_stride, int64_t(512))); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + +- if (src_cache.dtype() == at::ScalarType::Float) { +- CALL_CONVERT_FP8(uint8_t, float); +- } else if (src_cache.dtype() == at::ScalarType::Half) { +- CALL_CONVERT_FP8(uint8_t, uint16_t); +- } else if (src_cache.dtype() == at::ScalarType::BFloat16) { +- CALL_CONVERT_FP8(uint8_t, __nv_bfloat16); +- } else if (dst_cache.dtype() == at::ScalarType::Float) { +- CALL_CONVERT_FP8(float, uint8_t); +- } else if (dst_cache.dtype() == at::ScalarType::Half) { +- CALL_CONVERT_FP8(uint16_t, uint8_t); +- } else if (dst_cache.dtype() == at::ScalarType::BFloat16) { +- CALL_CONVERT_FP8(__nv_bfloat16, uint8_t); ++ if (kv_cache_dtype == "auto") { ++ if (src_cache.dtype() == at::ScalarType::Float) { ++ CALL_CONVERT_FP8(uint8_t, float, vllm::Fp8KVCacheDataType::kAuto); ++ } else if (src_cache.dtype() == at::ScalarType::Half) { ++ CALL_CONVERT_FP8(uint8_t, uint16_t, vllm::Fp8KVCacheDataType::kAuto); ++ } else if (src_cache.dtype() == at::ScalarType::BFloat16) { ++ CALL_CONVERT_FP8(uint8_t, __nv_bfloat16, vllm::Fp8KVCacheDataType::kAuto); ++ } else if (dst_cache.dtype() == at::ScalarType::Float) { ++ CALL_CONVERT_FP8(float, uint8_t, vllm::Fp8KVCacheDataType::kAuto); ++ } else if (dst_cache.dtype() == at::ScalarType::Half) { ++ CALL_CONVERT_FP8(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kAuto); ++ } else if (dst_cache.dtype() == at::ScalarType::BFloat16) { ++ CALL_CONVERT_FP8(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kAuto); ++ } ++ } else if (kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e4m3") { ++ if (src_cache.dtype() == at::ScalarType::Float) { ++ CALL_CONVERT_FP8(uint8_t, float, vllm::Fp8KVCacheDataType::kFp8E4M3); ++ } else if (src_cache.dtype() == at::ScalarType::Half) { ++ CALL_CONVERT_FP8(uint8_t, uint16_t, vllm::Fp8KVCacheDataType::kFp8E4M3); ++ } else if (src_cache.dtype() == at::ScalarType::BFloat16) { ++ CALL_CONVERT_FP8(uint8_t, __nv_bfloat16, ++ vllm::Fp8KVCacheDataType::kFp8E4M3); ++ } else if (dst_cache.dtype() == at::ScalarType::Float) { ++ CALL_CONVERT_FP8(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); ++ } else if (dst_cache.dtype() == at::ScalarType::Half) { ++ CALL_CONVERT_FP8(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); ++ } else if (dst_cache.dtype() == at::ScalarType::BFloat16) { ++ CALL_CONVERT_FP8(__nv_bfloat16, uint8_t, ++ vllm::Fp8KVCacheDataType::kFp8E4M3); ++ } ++ } else { ++ TORCH_CHECK(false, "Unsupported data type: ", kv_cache_dtype); + } + } +diff --git a/csrc/core/exception.hpp b/csrc/core/exception.hpp +new file mode 100644 +index 0000000..f3b2ffa +--- /dev/null ++++ b/csrc/core/exception.hpp +@@ -0,0 +1,3 @@ ++#pragma once ++ ++#define VLLM_IMPLIES(p, q) (!(p) || (q)) +diff --git a/csrc/core/math.hpp b/csrc/core/math.hpp +new file mode 100644 
+index 0000000..ba9f40a +--- /dev/null ++++ b/csrc/core/math.hpp +@@ -0,0 +1,7 @@ ++#include ++#include ++ ++inline uint32_t next_pow_2(uint32_t const num) { ++ if (num <= 1) return num; ++ return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); ++} +\ No newline at end of file +diff --git a/csrc/core/registration.h b/csrc/core/registration.h +new file mode 100644 +index 0000000..4d0ce1c +--- /dev/null ++++ b/csrc/core/registration.h +@@ -0,0 +1,27 @@ ++#pragma once ++ ++#include ++ ++#define _CONCAT(A, B) A##B ++#define CONCAT(A, B) _CONCAT(A, B) ++ ++#define _STRINGIFY(A) #A ++#define STRINGIFY(A) _STRINGIFY(A) ++ ++// A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME ++// could be a macro instead of a literal token. ++#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE) ++ ++// A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME ++// could be a macro instead of a literal token. ++#define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \ ++ TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE) ++ ++// REGISTER_EXTENSION allows the shared library to be loaded and initialized ++// via python's import statement. ++#define REGISTER_EXTENSION(NAME) \ ++ PyMODINIT_FUNC CONCAT(PyInit_, NAME)() { \ ++ static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, \ ++ STRINGIFY(NAME), nullptr, 0, nullptr}; \ ++ return PyModule_Create(&module); \ ++ } +diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp +new file mode 100644 +index 0000000..408e736 +--- /dev/null ++++ b/csrc/core/scalar_type.hpp +@@ -0,0 +1,347 @@ ++#pragma once ++ ++// For TORCH_CHECK ++#include ++ ++namespace vllm { ++ ++// ++// ScalarType can represent a wide range of floating point and integer types, ++// in particular it can be used to represent sub-byte data types (something ++// that torch.dtype currently does not support). ++// ++// The type definitions on the Python side can be found in: vllm/scalar_type.py ++// these type definitions should be kept up to date with any Python API changes ++// here. 
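An aside on the csrc/core/math.hpp helper added above: next_pow_2 rounds its argument up to the next power of two, passing 0 and 1 through unchanged. A stand-alone check of that behaviour; next_pow_2_ref simply restates the helper so the snippet compiles on its own, and __builtin_clz assumes GCC/Clang:

#include <cassert>
#include <climits>
#include <cstdint>

inline uint32_t next_pow_2_ref(uint32_t num) {
  if (num <= 1) return num;
  return 1u << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
}

int main() {
  assert(next_pow_2_ref(0) == 0);
  assert(next_pow_2_ref(1) == 1);
  assert(next_pow_2_ref(5) == 8);    // 5 -> 8
  assert(next_pow_2_ref(16) == 16);  // exact powers map to themselves
  assert(next_pow_2_ref(17) == 32);  // 17 -> 32
  return 0;
}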
++// ++class ScalarType { ++ public: ++ enum NanRepr : uint8_t { ++ NAN_NONE = 0, // nans are not supported ++ NAN_IEEE_754 = 1, // nans are: exp all 1s, mantissa not all 0s ++ NAN_EXTD_RANGE_MAX_MIN = 2, // nans are: exp all 1s, mantissa all 1s ++ ++ NAN_REPR_ID_MAX ++ }; ++ ++ constexpr ScalarType(uint8_t exponent, uint8_t mantissa, bool signed_, ++ int32_t bias, bool finite_values_only = false, ++ NanRepr nan_repr = NAN_IEEE_754) ++ : exponent(exponent), ++ mantissa(mantissa), ++ signed_(signed_), ++ bias(bias), ++ finite_values_only(finite_values_only), ++ nan_repr(nan_repr){}; ++ ++ static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) { ++ return ScalarType(0, size_bits - 1, true, bias); ++ } ++ ++ static constexpr ScalarType uint(uint8_t size_bits, int32_t bias = 0) { ++ return ScalarType(0, size_bits, false, bias); ++ } ++ ++ // IEEE 754 compliant floating point type ++ static constexpr ScalarType float_IEEE754(uint8_t exponent, ++ uint8_t mantissa) { ++ TORCH_CHECK(mantissa > 0 && exponent > 0); ++ return ScalarType(exponent, mantissa, true, 0, false, NAN_IEEE_754); ++ } ++ ++ // IEEE 754 non-compliant floating point type ++ static constexpr ScalarType float_(uint8_t exponent, uint8_t mantissa, ++ bool finite_values_only, ++ NanRepr nan_repr) { ++ TORCH_CHECK(nan_repr < NAN_REPR_ID_MAX, "Invalid NanRepr"); ++ TORCH_CHECK(mantissa > 0 && exponent > 0); ++ TORCH_CHECK(nan_repr != NAN_IEEE_754, ++ "use `float_IEEE754` constructor for floating point types that " ++ "follow IEEE 754 conventions"); ++ return ScalarType(exponent, mantissa, true, 0, finite_values_only, ++ nan_repr); ++ } ++ ++ uint8_t const exponent; // size of the exponent field (0 for integer types) ++ uint8_t const mantissa; // size of the mantissa field (size of the integer ++ // excluding the sign bit for integer types) ++ bool const signed_; // flag if the type supports negative numbers (i.e. has a ++ // sign bit) ++ int32_t const bias; // stored values equal value + bias, ++ // used for quantized type ++ ++ // Extra Floating point info ++ bool const finite_values_only; // i.e. no +/-inf if true ++ NanRepr const nan_repr; // how NaNs are represented ++ // (not applicable for integer types) ++ ++ using Id = int64_t; ++ ++ private: ++ // Field size in id ++ template ++ static constexpr size_t member_id_field_width() { ++ using T = std::decay_t; ++ return std::is_same_v ? 1 : sizeof(T) * 8; ++ } ++ ++ template ++ static constexpr auto reduce_members_helper(Fn f, Init val, Member member, ++ Rest... 
rest) { ++ auto new_val = f(val, member); ++ if constexpr (sizeof...(rest) > 0) { ++ return reduce_members_helper(f, new_val, rest...); ++ } else { ++ return new_val; ++ }; ++ } ++ ++ template ++ constexpr auto reduce_members(Fn f, Init init) const { ++ // Should be in constructor order for `from_id` ++ return reduce_members_helper(f, init, exponent, mantissa, signed_, bias, ++ finite_values_only, nan_repr); ++ }; ++ ++ template ++ static constexpr auto reduce_member_types(Fn f, Init init) { ++ constexpr auto dummy_type = ScalarType(0, 0, false, 0, false, NAN_NONE); ++ return dummy_type.reduce_members(f, init); ++ }; ++ ++ static constexpr auto id_size_bits() { ++ return reduce_member_types( ++ [](int acc, auto member) -> int { ++ return acc + member_id_field_width(); ++ }, ++ 0); ++ } ++ ++ public: ++ // unique id for this scalar type that can be computed at compile time for ++ // c++17 template specialization this is not needed once we migrate to ++ // c++20 and can pass literal classes as template parameters ++ constexpr Id id() const { ++ static_assert(id_size_bits() <= sizeof(Id) * 8, ++ "ScalarType id is too large to be stored"); ++ ++ auto or_and_advance = [](std::pair result, ++ auto member) -> std::pair { ++ auto [id, bit_offset] = result; ++ auto constexpr bits = member_id_field_width(); ++ return {id | (int64_t(member) & ((uint64_t(1) << bits) - 1)) ++ << bit_offset, ++ bit_offset + bits}; ++ }; ++ return reduce_members(or_and_advance, std::pair{}).first; ++ } ++ ++ // create a ScalarType from an id, for c++17 template specialization, ++ // this is not needed once we migrate to c++20 and can pass literal ++ // classes as template parameters ++ static constexpr ScalarType from_id(Id id) { ++ auto extract_and_advance = [id](auto result, auto member) { ++ using T = decltype(member); ++ auto [tuple, bit_offset] = result; ++ auto constexpr bits = member_id_field_width(); ++ auto extracted_val = static_cast((int64_t(id) >> bit_offset) & ++ ((uint64_t(1) << bits) - 1)); ++ auto new_tuple = std::tuple_cat(tuple, std::make_tuple(extracted_val)); ++ return std::pair{new_tuple, bit_offset + bits}; ++ }; ++ ++ auto [tuple_args, _] = reduce_member_types(extract_and_advance, ++ std::pair, int>{}); ++ return std::apply([](auto... 
args) { return ScalarType(args...); }, ++ tuple_args); ++ } ++ ++ constexpr int64_t size_bits() const { ++ return mantissa + exponent + is_signed(); ++ } ++ constexpr bool is_signed() const { return signed_; } ++ constexpr bool is_integer() const { return exponent == 0; } ++ constexpr bool is_floating_point() const { return exponent > 0; } ++ constexpr bool is_ieee_754() const { ++ return is_floating_point() && finite_values_only == false && ++ nan_repr == NAN_IEEE_754; ++ } ++ constexpr bool has_nans() const { ++ return is_floating_point() && nan_repr != NAN_NONE; ++ } ++ constexpr bool has_infs() const { ++ return is_floating_point() && finite_values_only == false; ++ } ++ constexpr bool has_bias() const { return bias != 0; } ++ ++ private: ++ double _floating_point_max() const { ++ TORCH_CHECK(mantissa <= 52 && exponent <= 11, ++ "Cannot represent max/min as a double for type ", str()); ++ ++ uint64_t max_mantissa = (uint64_t(1) << mantissa) - 1; ++ if (nan_repr == NAN_EXTD_RANGE_MAX_MIN) { ++ max_mantissa -= 1; ++ } ++ ++ uint64_t max_exponent = (uint64_t(1) << exponent) - 2; ++ if (nan_repr == NAN_EXTD_RANGE_MAX_MIN || nan_repr == NAN_NONE) { ++ TORCH_CHECK(exponent < 11, ++ "Cannot represent max/min as a double for type ", str()); ++ max_exponent += 1; ++ } ++ ++ // adjust the exponent to match that of a double ++ // for now we assume the exponent bias is the standard 2^(e-1) -1, (where e ++ // is the exponent bits), there is some precedent for non-standard biases, ++ // example `float8_e4m3b11fnuz` here: https://github.com/jax-ml/ml_dtypes ++ // but to avoid premature over complication we are just assuming the ++ // standard exponent bias until there is a need to support non-standard ++ // biases ++ uint64_t exponent_bias = (uint64_t(1) << (exponent - 1)) - 1; ++ uint64_t exponent_bias_double = (uint64_t(1) << 10) - 1; // double e = 11 ++ ++ uint64_t max_exponent_double = ++ max_exponent - exponent_bias + exponent_bias_double; ++ ++ // shift the mantissa into the position for a double and ++ // the exponent ++ uint64_t double_raw = ++ (max_mantissa << (52 - mantissa)) | (max_exponent_double << 52); ++ ++ return *reinterpret_cast(&double_raw); ++ } ++ ++ constexpr std::variant _raw_max() const { ++ if (is_floating_point()) { ++ return {_floating_point_max()}; ++ } else { ++ TORCH_CHECK(size_bits() < 64 || size_bits() == 64 && is_signed(), ++ "Cannot represent max as a int64_t"); ++ return {(int64_t(1) << mantissa) - 1}; ++ } ++ } ++ ++ constexpr std::variant _raw_min() const { ++ if (is_floating_point()) { ++ TORCH_CHECK(is_signed(), ++ "We currently assume all floating point types are signed"); ++ constexpr uint64_t sign_bit_double = (uint64_t(1) << 63); ++ ++ double max = _floating_point_max(); ++ uint64_t max_raw = *reinterpret_cast(&max); ++ uint64_t min_raw = max_raw | sign_bit_double; ++ return {*reinterpret_cast(&min_raw)}; ++ } else { ++ TORCH_CHECK(!is_signed() || size_bits() <= 64, ++ "Cannot represent min as a int64_t"); ++ if (is_signed()) { ++ // set the top bit to 1 (i.e. INT64_MIN) and the rest to 0 ++ // then perform an arithmetic shift right to set all the bits above ++ // (size_bits() - 1) to 1 ++ return {INT64_MIN >> (64 - size_bits())}; ++ } else { ++ return {int64_t(0)}; ++ } ++ } ++ } ++ ++ public: ++ // Max representable value for this scalar type. 
++ // (accounting for bias if there is one) ++ constexpr std::variant max() const { ++ return std::visit( ++ [this](auto x) -> std::variant { return {x - bias}; }, ++ _raw_max()); ++ } ++ ++ // Min representable value for this scalar type. ++ // (accounting for bias if there is one) ++ constexpr std::variant min() const { ++ return std::visit( ++ [this](auto x) -> std::variant { return {x - bias}; }, ++ _raw_min()); ++ } ++ ++ std::string str() const { ++ /* naming generally follows: https://github.com/jax-ml/ml_dtypes ++ * for floating point types (leading f) the scheme is: ++ * `float_em[flags]` ++ * flags: ++ * - no-flags: means it follows IEEE 754 conventions ++ * - f: means finite values only (no infinities) ++ * - n: means nans are supported (non-standard encoding) ++ * for integer types the scheme is: ++ * `[u]int[b]` ++ * - if bias is not present it means its zero ++ */ ++ if (is_floating_point()) { ++ auto ret = "float" + std::to_string(size_bits()) + "_e" + ++ std::to_string(exponent) + "m" + std::to_string(mantissa); ++ if (!is_ieee_754()) { ++ if (finite_values_only) { ++ ret += "f"; ++ } ++ if (nan_repr != NAN_NONE) { ++ ret += "n"; ++ } ++ } ++ return ret; ++ } else { ++ auto ret = ((is_signed()) ? "int" : "uint") + std::to_string(size_bits()); ++ if (has_bias()) { ++ ret += "b" + std::to_string(bias); ++ } ++ return ret; ++ } ++ } ++ ++ constexpr bool operator==(ScalarType const& other) const { ++ return mantissa == other.mantissa && exponent == other.exponent && ++ bias == other.bias && signed_ == other.signed_ && ++ finite_values_only == other.finite_values_only && ++ nan_repr == other.nan_repr; ++ } ++}; ++ ++using ScalarTypeId = ScalarType::Id; ++ ++// "rust style" names generally following: ++// https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L60-L70 ++static inline constexpr auto kS4 = ScalarType::int_(4); ++static inline constexpr auto kU4 = ScalarType::uint(4); ++static inline constexpr auto kU4B8 = ScalarType::uint(4, 8); ++static inline constexpr auto kS8 = ScalarType::int_(8); ++static inline constexpr auto kU8 = ScalarType::uint(8); ++static inline constexpr auto kU8B128 = ScalarType::uint(8, 128); ++ ++static inline constexpr auto kFE3M2f = ++ ScalarType::float_(3, 2, true, ScalarType::NAN_NONE); ++static inline constexpr auto kFE4M3fn = ++ ScalarType::float_(4, 3, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN); ++static inline constexpr auto kFE5M2 = ScalarType::float_IEEE754(5, 2); ++static inline constexpr auto kFE8M7 = ScalarType::float_IEEE754(8, 7); ++static inline constexpr auto kFE5M10 = ScalarType::float_IEEE754(5, 10); ++ ++// Fixed width style names, generally following: ++// https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L47-L57 ++static inline constexpr auto kInt4 = kS4; ++static inline constexpr auto kUint4 = kU4; ++static inline constexpr auto kUint4b8 = kU4B8; ++static inline constexpr auto kInt8 = kS8; ++static inline constexpr auto kUint8 = kU8; ++static inline constexpr auto kUint8b128 = kU8B128; ++ ++static inline constexpr auto kFloat6_e3m2f = kFE3M2f; ++static inline constexpr auto kFloat8_e4m3fn = kFE4M3fn; ++static inline constexpr auto kFloat8_e5m2 = kFE5M2; ++static inline constexpr auto kFloat16_e8m7 = kFE8M7; ++static inline constexpr auto kFloat16_e5m10 = kFE5M10; ++ ++// colloquial names ++static inline constexpr auto kHalf = kFE5M10; ++static inline constexpr auto kFloat16 = kHalf; ++static inline 
constexpr auto kBFloat16 = kFE8M7; ++ ++static inline constexpr auto kFloat16Id = kFloat16.id(); ++}; // namespace vllm +diff --git a/csrc/cpu/activation.cpp b/csrc/cpu/activation.cpp +index 1bd24eb..039b8d5 100644 +--- a/csrc/cpu/activation.cpp ++++ b/csrc/cpu/activation.cpp +@@ -1,10 +1,10 @@ + #include "cpu_types.hpp" + + namespace { +-template +-void activation_kernel(int num_tokens, int d, scalar_t *__restrict__ input, +- scalar_t *__restrict__ output) { ++void activation_kernel(int num_tokens, int d, scalar_t* __restrict__ input, ++ scalar_t* __restrict__ output) { + using scalar_vec_t = vec_op::vec_t; + constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); + +@@ -34,13 +34,13 @@ void activation_kernel(int num_tokens, int d, scalar_t *__restrict__ input, + } + } + +-FORCE_INLINE vec_op::FP32Vec8 silu_act(const vec_op::FP32Vec8 &x) { ++FORCE_INLINE vec_op::FP32Vec8 silu_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 zeros(0.0); + const vec_op::FP32Vec8 ones(1.0); + return x / (ones + (zeros - x).exp()); + } + +-FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8 &x) { ++FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(0.79788456f); + const vec_op::FP32Vec8 w2(0.044715f); +@@ -50,7 +50,7 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8 &x) { + return w3 * x * (ones + t); + } + +-FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8 &x) { ++FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(0.79788456f); + const vec_op::FP32Vec8 w2(0.044715f); +@@ -59,14 +59,21 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8 &x) { + return w3 * x * (ones + t); + } + +-FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8 &x) { ++FORCE_INLINE vec_op::FP32Vec8 gelu_quick_act(const vec_op::FP32Vec8& x) { ++ const vec_op::FP32Vec8 zeros(0.0); ++ const vec_op::FP32Vec8 ones(1.0); ++ const vec_op::FP32Vec8 w1(1.702f); ++ return x / (ones + (zeros - w1 * x).exp()); ++} ++ ++FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(M_SQRT1_2); + const vec_op::FP32Vec8 w2(0.5); + return x * w2 * (ones + (x * w1).er()); + } + +-FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8 &x) { ++FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(M_SQRT2 * M_2_SQRTPI * 0.5); + const vec_op::FP32Vec8 w2(0.5); +@@ -75,40 +82,36 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8 &x) { + const vec_op::FP32Vec8 inner = w1 * (x + x_3 * w3); + return x * w2 * (ones + inner.tanh()); + } +-}; // namespace ++}; // namespace + +-void silu_and_mul(torch::Tensor &out, torch::Tensor &input) { ++void silu_and_mul(torch::Tensor& out, torch::Tensor& input) { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1) / 2; + +- VLLM_DISPATCH_FLOATING_TYPES( +- input.scalar_type(), "silu_and_mul_impl", [&] { +- CPU_KERNEL_GUARD_IN(silu_and_mul_impl) +- activation_kernel(num_tokens, d, +- input.data_ptr(), +- out.data_ptr()); +- CPU_KERNEL_GUARD_OUT(silu_and_mul_impl) +- }); ++ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "silu_and_mul_impl", [&] { ++ CPU_KERNEL_GUARD_IN(silu_and_mul_impl) ++ activation_kernel( ++ num_tokens, d, input.data_ptr(), out.data_ptr()); ++ 
CPU_KERNEL_GUARD_OUT(silu_and_mul_impl) ++ }); + } + +-void gelu_and_mul(torch::Tensor &out, // [..., d] +- torch::Tensor &input) // [..., 2 * d] ++void gelu_and_mul(torch::Tensor& out, // [..., d] ++ torch::Tensor& input) // [..., 2 * d] + { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1) / 2; + +- VLLM_DISPATCH_FLOATING_TYPES( +- input.scalar_type(), "gelu_and_mul_impl", [&] { +- CPU_KERNEL_GUARD_IN(gelu_and_mul_impl) +- activation_kernel(num_tokens, d, +- input.data_ptr(), +- out.data_ptr()); +- CPU_KERNEL_GUARD_OUT(gelu_and_mul_impl) +- }); ++ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_and_mul_impl", [&] { ++ CPU_KERNEL_GUARD_IN(gelu_and_mul_impl) ++ activation_kernel( ++ num_tokens, d, input.data_ptr(), out.data_ptr()); ++ CPU_KERNEL_GUARD_OUT(gelu_and_mul_impl) ++ }); + } + +-void gelu_tanh_and_mul(torch::Tensor &out, // [..., d] +- torch::Tensor &input) // [..., 2 * d] ++void gelu_tanh_and_mul(torch::Tensor& out, // [..., d] ++ torch::Tensor& input) // [..., 2 * d] + { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1) / 2; +@@ -123,7 +126,7 @@ void gelu_tanh_and_mul(torch::Tensor &out, // [..., d] + }); + } + +-void gelu_new(torch::Tensor &out, torch::Tensor &input) { ++void gelu_new(torch::Tensor& out, torch::Tensor& input) { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1); + +@@ -135,7 +138,7 @@ void gelu_new(torch::Tensor &out, torch::Tensor &input) { + }); + } + +-void gelu_fast(torch::Tensor &out, torch::Tensor &input) { ++void gelu_fast(torch::Tensor& out, torch::Tensor& input) { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1); + +@@ -146,3 +149,15 @@ void gelu_fast(torch::Tensor &out, torch::Tensor &input) { + CPU_KERNEL_GUARD_OUT(gelu_fast_impl) + }); + } ++ ++void gelu_quick(torch::Tensor& out, torch::Tensor& input) { ++ int num_tokens = input.numel() / input.size(-1); ++ int d = input.size(-1); ++ ++ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_quick_impl", [&] { ++ CPU_KERNEL_GUARD_IN(gelu_quick_impl) ++ activation_kernel( ++ num_tokens, d, input.data_ptr(), out.data_ptr()); ++ CPU_KERNEL_GUARD_OUT(gelu_quick_impl) ++ }); ++} +diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp +index c1d765b..ef5b140 100644 +--- a/csrc/cpu/attention.cpp ++++ b/csrc/cpu/attention.cpp +@@ -2,7 +2,8 @@ + + namespace { + +-template struct KernelVecType { ++template ++struct KernelVecType { + using q_load_vec_type = void; + using q_vec_type = void; + using k_load_vec_type = void; +@@ -11,7 +12,8 @@ template struct KernelVecType { + using v_load_vec_type = void; + }; + +-template <> struct KernelVecType { ++template <> ++struct KernelVecType { + using q_load_vec_type = vec_op::FP32Vec4; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::FP32Vec16; +@@ -20,8 +22,27 @@ template <> struct KernelVecType { + using v_load_vec_type = vec_op::FP32Vec16; + }; + ++template <> ++struct KernelVecType { ++#ifdef __powerpc64__ ++ // Power architecture-specific vector types ++ using q_load_vec_type = vec_op::FP32Vec8; ++ using k_load_vec_type = vec_op::FP32Vec16; ++ using v_load_vec_type = vec_op::FP32Vec16; ++#else ++ // Fallback for other architectures, including x86 ++ using q_load_vec_type = vec_op::FP16Vec8; ++ using k_load_vec_type = vec_op::FP16Vec16; ++ using v_load_vec_type = vec_op::FP16Vec16; ++#endif ++ using q_vec_type = vec_op::FP32Vec16; ++ using k_vec_type = vec_op::FP32Vec16; ++ using qk_acc_vec_type = vec_op::FP32Vec16; ++}; ++ 
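The KernelVecType specializations above form a compile-time traits table: one primary template plus a specialization per scalar type, so the CPU attention kernels can select SIMD load and accumulation types (with per-architecture fallbacks) without runtime branching. A minimal stand-alone analogue of the pattern; VecTraits and its members are illustrative names, not vLLM types:

#include <cstdint>

template <typename scalar_t>
struct VecTraits;  // primary template intentionally left undefined

template <>
struct VecTraits<float> {
  using load_vec_t = float;  // placeholder for e.g. vec_op::FP32Vec16
  using acc_vec_t = float;
  static constexpr int kElemsPerVec = 16;
};

template <>
struct VecTraits<uint16_t> {  // stand-in for a 16-bit storage type (fp16/bf16)
  using load_vec_t = uint16_t;
  using acc_vec_t = float;  // accumulate in fp32 for accuracy
  static constexpr int kElemsPerVec = 16;
};

template <typename scalar_t>
constexpr int elems_per_vec() {
  return VecTraits<scalar_t>::kElemsPerVec;
}

static_assert(elems_per_vec<float>() == 16, "resolved at compile time");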
+ #ifdef __AVX512BF16__ +-template <> struct KernelVecType { ++template <> ++struct KernelVecType { + using q_load_vec_type = vec_op::BF16Vec8; + using q_vec_type = vec_op::BF16Vec32; + using k_load_vec_type = vec_op::BF16Vec32; +@@ -30,7 +51,12 @@ template <> struct KernelVecType { + using v_load_vec_type = vec_op::BF16Vec16; + }; + #else +-template <> struct KernelVecType { ++ #ifdef __aarch64__ ++ #ifndef ARM_BF16_SUPPORT ++ // pass ++ #else ++template <> ++struct KernelVecType { + using q_load_vec_type = vec_op::BF16Vec8; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::BF16Vec16; +@@ -38,10 +64,22 @@ template <> struct KernelVecType { + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::BF16Vec16; + }; ++ #endif ++ #else ++template <> ++struct KernelVecType { ++ using q_load_vec_type = vec_op::BF16Vec8; ++ using q_vec_type = vec_op::FP32Vec16; ++ using k_load_vec_type = vec_op::BF16Vec16; ++ using k_vec_type = vec_op::FP32Vec16; ++ using qk_acc_vec_type = vec_op::FP32Vec16; ++ using v_load_vec_type = vec_op::BF16Vec16; ++}; ++ #endif + #endif + + template +-FORCE_INLINE std::pair reduceSoftmax(T *data, const int size, ++FORCE_INLINE std::pair reduceSoftmax(T* data, const int size, + const int capacity) { + T max = data[0]; + for (int i = 1; i < size; ++i) { +@@ -67,10 +105,11 @@ FORCE_INLINE std::pair reduceSoftmax(T *data, const int size, + } + + template +-FORCE_INLINE std::pair +-reduceSoftmaxAlibi(T *data, const int size, const int capacity, +- const float alibi_slope, const int start_index, +- const int seq_len) { ++FORCE_INLINE std::pair reduceSoftmaxAlibi(T* data, const int size, ++ const int capacity, ++ const float alibi_slope, ++ const int start_index, ++ const int seq_len) { + data[0] += alibi_slope * (start_index - seq_len + 1); + T max = data[0]; + for (int i = 1; i < size; ++i) { +@@ -98,7 +137,7 @@ reduceSoftmaxAlibi(T *data, const int size, const int capacity, + } + + template +-FORCE_INLINE void reducePartitonSoftmax(const T *max_data, T *sum_data, ++FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data, + const int size) { + T max = max_data[0]; + for (int i = 1; i < size; ++i) { +@@ -132,9 +171,9 @@ struct reduceQKBlockKernel { + static_assert(k_load_vec_type::get_elem_num() % x == 0); + static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16); + +- FORCE_INLINE static void call(const scalar_t *__restrict__ q, +- const scalar_t *__restrict__ k_block, +- float *__restrict__ logits, float scale, ++ FORCE_INLINE static void call(const scalar_t* __restrict__ q, ++ const scalar_t* __restrict__ k_block, ++ float* __restrict__ logits, float scale, + const int token_num) { + const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP; + +@@ -196,8 +235,8 @@ struct reduceQKBlockKernel { + + template +-FORCE_INLINE void reduceValueBlock(const float *prob, const scalar_t *v_block, +- acc_t &&acc) { ++FORCE_INLINE void reduceValueBlock(const float* prob, const scalar_t* v_block, ++ acc_t&& acc) { + using v_load_vec_type = typename KernelVecType::v_load_vec_type; + constexpr int ELEM_NUM = v_load_vec_type::get_elem_num(); + static_assert(BLOCK_SIZE == ELEM_NUM); +@@ -209,27 +248,27 @@ FORCE_INLINE void reduceValueBlock(const float *prob, const scalar_t *v_block, + acc[head_elem_idx] = acc[head_elem_idx] + prob_vec * fp32_v_vec; + }); + } +-}; // namespace ++}; // namespace + + // Paged attention v1 + namespace { + template + struct paged_attention_v1_impl { +- static void +- 
call(scalar_t *__restrict__ out, // [num_seqs, num_heads, head_size] +- const scalar_t *__restrict__ q, // [num_seqs, num_heads, head_size] +- const scalar_t *__restrict__ k_cache, // [num_blocks, num_kv_heads, ++ static void call( ++ scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] ++ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] ++ const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] +- const scalar_t *__restrict__ v_cache, // [num_blocks, num_kv_heads, ++ const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] +- const int num_kv_heads, const float scale, +- const int +- *__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] +- const int *__restrict__ seq_lens, // [num_seqs] +- const int max_num_blocks_per_seq, +- const float *__restrict__ alibi_slopes, // [num_heads] +- const int q_stride, const int kv_block_stride, const int kv_head_stride, +- const int num_seqs, const int num_heads) { ++ const int num_kv_heads, const float scale, ++ const int* __restrict__ block_tables, // [num_seqs, ++ // max_num_blocks_per_seq] ++ const int* __restrict__ seq_lens, // [num_seqs] ++ const int max_num_blocks_per_seq, ++ const float* __restrict__ alibi_slopes, // [num_heads] ++ const int q_stride, const int kv_block_stride, const int kv_head_stride, ++ const int num_seqs, const int num_heads) { + constexpr int x = 16 / sizeof(scalar_t); + const int num_queries_per_kv = num_heads / num_kv_heads; + +@@ -243,32 +282,31 @@ struct paged_attention_v1_impl { + + size_t logits_bytes = + parallel_work_item_num * max_seq_len_padded * sizeof(float); +- float *logits = (float *)std::aligned_alloc( +- 64, logits_bytes); // Cacheline alignment for each context token. +- // [parallel_work_item_num, max_seq_len_padded] ++ float* logits = (float*)std::aligned_alloc( ++ 64, logits_bytes); // Cacheline alignment for each context token. 
++ // [parallel_work_item_num, max_seq_len_padded] + + #pragma omp parallel for collapse(2) schedule(dynamic, 1) + for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { + for (int head_idx = 0; head_idx < num_heads; ++head_idx) { + int seq_len = seq_lens[seq_idx]; +- const int *seq_block_table = ++ const int* seq_block_table = + block_tables + max_num_blocks_per_seq * seq_idx; + const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE; + const int64_t kv_head_idx = head_idx / num_queries_per_kv; +- const scalar_t *__restrict__ q_vec_ptr = ++ const scalar_t* __restrict__ q_vec_ptr = + q + seq_idx * q_stride + head_idx * HEAD_SIZE; +- const int last_block_token_num = +- seq_len - (block_num - 1) * BLOCK_SIZE; +- float *__restrict__ thread_block_logits = ++ const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE; ++ float* __restrict__ thread_block_logits = + logits + omp_get_thread_num() * max_seq_len_padded; + + // Compute logits + for (int block_idx = 0; block_idx < block_num; ++block_idx) { + const int64_t physical_block_idx = seq_block_table[block_idx]; +- const scalar_t *__restrict__ k_block_cache_ptr = ++ const scalar_t* __restrict__ k_block_cache_ptr = + k_cache + physical_block_idx * kv_block_stride + + kv_head_idx * kv_head_stride; +- float *__restrict__ head_block_logits = ++ float* __restrict__ head_block_logits = + thread_block_logits + block_idx * BLOCK_SIZE; + + reduceQKBlockKernel::call( +@@ -282,8 +320,7 @@ struct paged_attention_v1_impl { + block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0, + seq_len); + } else { +- reduceSoftmax(thread_block_logits, seq_len, +- block_num * BLOCK_SIZE); ++ reduceSoftmax(thread_block_logits, seq_len, block_num * BLOCK_SIZE); + } + + // Compute value +@@ -293,14 +330,14 @@ struct paged_attention_v1_impl { + for (int head_part_idx = 0; head_part_idx < head_partition_num; + ++head_part_idx) { + vec_op::FP32Vec16 accums[head_elem_num_per_partition]; +- scalar_t *__restrict__ out_ptr = ++ scalar_t* __restrict__ out_ptr = + out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + + head_part_idx * head_elem_num_per_partition; + for (int block_idx = 0; block_idx < block_num; ++block_idx) { + const int64_t physical_block_idx = seq_block_table[block_idx]; +- const float *__restrict__ prob_vec_ptr = ++ const float* __restrict__ prob_vec_ptr = + thread_block_logits + block_idx * BLOCK_SIZE; +- const scalar_t *__restrict__ v_block_cache_ptr = ++ const scalar_t* __restrict__ v_block_cache_ptr = + v_cache + physical_block_idx * kv_block_stride + + kv_head_idx * kv_head_stride + + BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; +@@ -311,7 +348,7 @@ struct paged_attention_v1_impl { + if (block_idx != block_num - 1) { + const int64_t next_physical_block_idx = + seq_block_table[block_idx + 1]; +- const scalar_t *__restrict__ next_v_block_cache_ptr = ++ const scalar_t* __restrict__ next_v_block_cache_ptr = + v_cache + next_physical_block_idx * kv_block_stride + + kv_head_idx * kv_head_stride + + BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; +@@ -340,16 +377,16 @@ struct paged_attention_v1_impl { + #define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \ + paged_attention_v1_impl::call( \ + out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ +- block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ ++ block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ + alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs, \ + num_heads); + + template + void 
paged_attention_v1_impl_launcher( +- torch::Tensor &out, torch::Tensor &query, torch::Tensor &key_cache, +- torch::Tensor &value_cache, int num_kv_heads, float scale, +- torch::Tensor &block_tables, torch::Tensor &seq_lens, +- int max_seq_len, const c10::optional &alibi_slopes) { ++ torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, ++ torch::Tensor& value_cache, int num_kv_heads, float scale, ++ torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, ++ const std::optional& alibi_slopes) { + int num_seqs = query.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); +@@ -359,68 +396,77 @@ void paged_attention_v1_impl_launcher( + int kv_head_stride = key_cache.stride(1); + + // NOTE: alibi_slopes is optional. +- const float *alibi_slopes_ptr = ++ const float* alibi_slopes_ptr = + alibi_slopes +- ? reinterpret_cast(alibi_slopes.value().data_ptr()) ++ ? reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + +- T *out_ptr = reinterpret_cast(out.data_ptr()); +- T *query_ptr = reinterpret_cast(query.data_ptr()); +- T *key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); +- T *value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); +- int *block_tables_ptr = block_tables.data_ptr(); +- int *seq_lens_ptr = seq_lens.data_ptr(); ++ T* out_ptr = reinterpret_cast(out.data_ptr()); ++ T* query_ptr = reinterpret_cast(query.data_ptr()); ++ T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); ++ T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); ++ int* block_tables_ptr = block_tables.data_ptr(); ++ int* seq_lens_ptr = seq_lens.data_ptr(); + + switch (head_size) { +- case 64: +- LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); +- break; +- case 80: +- LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); +- break; +- case 96: +- LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); +- break; +- case 112: +- LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); +- break; +- case 128: +- LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); +- break; +- case 256: +- LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); +- break; +- default: +- TORCH_CHECK(false, "Unsupported head size: ", head_size); +- break; ++ case 32: ++ LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); ++ break; ++ case 64: ++ LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); ++ break; ++ case 80: ++ LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); ++ break; ++ case 96: ++ LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); ++ break; ++ case 112: ++ LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); ++ break; ++ case 128: ++ LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); ++ break; ++ case 192: ++ LAUNCH_V1_ATTENTION_KERNEL(T, 192, BLOCK_SIZE); ++ break; ++ case 256: ++ LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); ++ break; ++ default: ++ TORCH_CHECK(false, "Unsupported head size: ", head_size); ++ break; + } + } + +-#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ +- paged_attention_v1_impl_launcher( \ +- out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ ++#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ ++ paged_attention_v1_impl_launcher( \ ++ out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ + seq_lens, max_seq_len, alibi_slopes); + +-#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ +- switch (block_size) { \ +- case 16: \ +- CALL_V1_KERNEL_LAUNCHER(T, 16); \ +- break; \ +- default: \ +- TORCH_CHECK(false, "Unsupported block size: ", block_size); \ +- break; \ ++#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ ++ switch (block_size) { \ ++ case 16: 
\ ++ CALL_V1_KERNEL_LAUNCHER(T, 16); \ ++ break; \ ++ default: \ ++ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ ++ break; \ + } +-} // namespace +- +-void paged_attention_v1(torch::Tensor &out, torch::Tensor &query, +- torch::Tensor &key_cache, torch::Tensor &value_cache, +- int num_kv_heads, float scale, +- torch::Tensor &block_tables, +- torch::Tensor &seq_lens, int block_size, +- int max_seq_len, +- const c10::optional &alibi_slopes, +- const std::string &kv_cache_dtype, float kv_scale) { +- TORCH_CHECK(kv_scale == 1.0f); ++} // namespace ++ ++void paged_attention_v1( ++ torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, ++ torch::Tensor& value_cache, int64_t num_kv_heads, double scale, ++ torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, ++ int64_t max_seq_len, const std::optional& alibi_slopes, ++ const std::string& kv_cache_dtype, double k_scale, double v_scale, ++ const int64_t tp_rank, const int64_t blocksparse_local_blocks, ++ const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, ++ const int64_t blocksparse_head_sliding_step) { ++ TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); ++ TORCH_CHECK(blocksparse_vert_stride <= 1, ++ "CPU backend does not support blocksparse attention yet."); + VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl", + [&] { + CPU_KERNEL_GUARD_IN(paged_attention_v1_impl) +@@ -434,23 +480,24 @@ namespace { + template + struct paged_attention_v2_impl { + static void call( +- scalar_t *__restrict__ out, // [num_seqs, num_heads, head_size] +- float *__restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] +- float +- *__restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] +- scalar_t *__restrict__ tmp_out, // [num_seqs, num_heads, +- // max_num_partitions, head_size] +- const scalar_t *__restrict__ q, // [num_seqs, num_heads, head_size] +- const scalar_t *__restrict__ k_cache, // [num_blocks, num_kv_heads, +- // head_size/x, block_size, x] +- const scalar_t *__restrict__ v_cache, // [num_blocks, num_kv_heads, +- // head_size, block_size] ++ scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] ++ float* __restrict__ exp_sums, // [num_seqs, num_heads, ++ // max_num_partitions] ++ float* __restrict__ max_logits, // [num_seqs, num_heads, ++ // max_num_partitions] ++ scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, ++ // max_num_partitions, head_size] ++ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] ++ const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, ++ // head_size/x, block_size, x] ++ const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, ++ // head_size, block_size] + const int num_kv_heads, const float scale, +- const int +- *__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] +- const int *__restrict__ seq_lens, // [num_seqs] ++ const int* __restrict__ block_tables, // [num_seqs, ++ // max_num_blocks_per_seq] ++ const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_blocks_per_seq, +- const float *__restrict__ alibi_slopes, // [num_heads] ++ const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + const int num_seqs, const int num_heads, const int max_num_partitions) { + constexpr int x = 16 / sizeof(scalar_t); +@@ -468,8 +515,7 @@ struct paged_attention_v2_impl { + const int seq_len = seq_lens[seq_idx]; + const int start_token_idx = partition_idx * 
PARTITION_SIZE; + +- if (start_token_idx >= seq_len) +- continue; ++ if (start_token_idx >= seq_len) continue; + + const int partition_num = + (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; +@@ -477,15 +523,14 @@ struct paged_attention_v2_impl { + const int token_num = + (std::min(seq_len, start_token_idx + PARTITION_SIZE) - + start_token_idx); +- const int block_num = +- (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE; ++ const int block_num = (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE; + const int last_block_token_num = + token_num - (block_num - 1) * BLOCK_SIZE; +- const int *seq_block_table = block_tables + ++ const int* seq_block_table = block_tables + + max_num_blocks_per_seq * seq_idx + + start_token_idx / BLOCK_SIZE; + const int64_t kv_head_idx = head_idx / num_queries_per_kv; +- const scalar_t *__restrict__ q_vec_ptr = ++ const scalar_t* __restrict__ q_vec_ptr = + q + seq_idx * q_stride + head_idx * HEAD_SIZE; + + float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0}; +@@ -493,10 +538,10 @@ struct paged_attention_v2_impl { + // Compute logits + for (int block_idx = 0; block_idx < block_num; ++block_idx) { + const int64_t physical_block_idx = seq_block_table[block_idx]; +- const scalar_t *__restrict__ k_block_cache_ptr = ++ const scalar_t* __restrict__ k_block_cache_ptr = + k_cache + physical_block_idx * kv_block_stride + + kv_head_idx * kv_head_stride; +- float *__restrict__ head_block_logits = ++ float* __restrict__ head_block_logits = + logits + block_idx * BLOCK_SIZE; + + reduceQKBlockKernel::call( +@@ -510,13 +555,13 @@ struct paged_attention_v2_impl { + logits, token_num, block_num * BLOCK_SIZE, + alibi_slopes[head_idx], start_token_idx, seq_len); + } else { +- max_and_sum = reduceSoftmax(logits, token_num, +- block_num * BLOCK_SIZE); ++ max_and_sum = ++ reduceSoftmax(logits, token_num, block_num * BLOCK_SIZE); + } + +- auto &&[max_logit, exp_sum] = max_and_sum; ++ auto&& [max_logit, exp_sum] = max_and_sum; + +- scalar_t *__restrict__ output_buffer = nullptr; ++ scalar_t* __restrict__ output_buffer = nullptr; + if (!no_reduce) { + auto idx = seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions + partition_idx; +@@ -538,13 +583,13 @@ struct paged_attention_v2_impl { + for (int head_part_idx = 0; head_part_idx < head_partition_num; + ++head_part_idx) { + vec_op::FP32Vec16 accums[head_elem_num_per_partition]; +- scalar_t *__restrict__ out_ptr = ++ scalar_t* __restrict__ out_ptr = + output_buffer + head_part_idx * head_elem_num_per_partition; + for (int block_idx = 0; block_idx < block_num; ++block_idx) { + const int64_t physical_block_idx = seq_block_table[block_idx]; +- const float *__restrict__ prob_vec_ptr = ++ const float* __restrict__ prob_vec_ptr = + logits + block_idx * BLOCK_SIZE; +- const scalar_t *__restrict__ v_block_cache_ptr = ++ const scalar_t* __restrict__ v_block_cache_ptr = + v_cache + physical_block_idx * kv_block_stride + + kv_head_idx * kv_head_stride + + BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; +@@ -555,7 +600,7 @@ struct paged_attention_v2_impl { + if (block_idx != block_num - 1) { + const int64_t next_physical_block_idx = + seq_block_table[block_idx + 1]; +- const scalar_t *__restrict__ next_v_block_cache_ptr = ++ const scalar_t* __restrict__ next_v_block_cache_ptr = + v_cache + next_physical_block_idx * kv_block_stride + + kv_head_idx * kv_head_stride + + BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; +@@ -587,8 +632,7 @@ struct paged_attention_v2_impl { + const int partition_num = + (seq_len + 
PARTITION_SIZE - 1) / PARTITION_SIZE; + +- if (partition_num == 1) +- continue; ++ if (partition_num == 1) continue; + + reducePartitonSoftmax( + max_logits + seq_idx * num_heads * max_num_partitions + +@@ -603,11 +647,11 @@ struct paged_attention_v2_impl { + using v_load_vec_type = typename KernelVecType::v_load_vec_type; + static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE); + constexpr int head_elem_num_per_group = +- 16; // Note: didn't align with the cacheline size, due to some HEAD_SIZE +- // didn't align with 64 bytes ++ 16; // Note: didn't align with the cacheline size, due to some ++ // HEAD_SIZE didn't align with 64 bytes + static_assert(HEAD_SIZE % head_elem_num_per_group == 0); + constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group; +- const float *__restrict__ rescale_factors = exp_sums; ++ const float* __restrict__ rescale_factors = exp_sums; + #pragma omp parallel for collapse(3) schedule(static, 1) + for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { + for (int head_idx = 0; head_idx < num_heads; ++head_idx) { +@@ -616,17 +660,16 @@ struct paged_attention_v2_impl { + const int partition_num = + (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; + +- if (partition_num == 1) +- continue; ++ if (partition_num == 1) continue; + +- const float *__restrict__ seq_head_rescale_factors = ++ const float* __restrict__ seq_head_rescale_factors = + rescale_factors + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; +- const scalar_t *__restrict__ seq_head_tmp_out = ++ const scalar_t* __restrict__ seq_head_tmp_out = + tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + + group_idx * head_elem_num_per_group; +- scalar_t *__restrict__ seq_head_output = ++ scalar_t* __restrict__ seq_head_output = + out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + + group_idx * head_elem_num_per_group; + +@@ -645,21 +688,21 @@ struct paged_attention_v2_impl { + } + }; + +-#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \ +- paged_attention_v2_impl::call( \ +- out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \ +- key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ +- seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ +- kv_block_stride, kv_head_stride, num_seqs, num_heads, \ ++#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \ ++ paged_attention_v2_impl::call( \ ++ out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \ ++ key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ ++ seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ ++ kv_block_stride, kv_head_stride, num_seqs, num_heads, \ + max_num_partitions); + + template + void paged_attention_v2_impl_launcher( +- torch::Tensor &out, torch::Tensor &exp_sums, torch::Tensor &max_logits, +- torch::Tensor &tmp_out, torch::Tensor &query, torch::Tensor &key_cache, +- torch::Tensor &value_cache, int num_kv_heads, float scale, +- torch::Tensor &block_tables, torch::Tensor &seq_lens, int block_size, +- int max_seq_len, const c10::optional &alibi_slopes) { ++ torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, ++ torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, ++ torch::Tensor& value_cache, int num_kv_heads, float scale, ++ torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, ++ int max_seq_len, const std::optional& alibi_slopes) { + int num_seqs = query.size(0); + 
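  // Descriptive note: query is laid out as [num_seqs, num_heads, head_size],
  // and the launcher below also reads max_num_partitions from the trailing
  // dimension of exp_sums ([num_seqs, num_heads, max_num_partitions]) before
  // extracting raw pointers and dispatching on head_size.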
int num_heads = query.size(1); + int head_size = query.size(2); +@@ -670,77 +713,86 @@ void paged_attention_v2_impl_launcher( + int max_num_partitions = exp_sums.size(-1); + + // NOTE: alibi_slopes is optional. +- const float *alibi_slopes_ptr = ++ const float* alibi_slopes_ptr = + alibi_slopes +- ? reinterpret_cast(alibi_slopes.value().data_ptr()) ++ ? reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + +- T *out_ptr = reinterpret_cast(out.data_ptr()); +- float *exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); +- float *max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); +- T *tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); +- T *query_ptr = reinterpret_cast(query.data_ptr()); +- T *key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); +- T *value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); +- int *block_tables_ptr = block_tables.data_ptr(); +- int *seq_lens_ptr = seq_lens.data_ptr(); ++ T* out_ptr = reinterpret_cast(out.data_ptr()); ++ float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); ++ float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); ++ T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); ++ T* query_ptr = reinterpret_cast(query.data_ptr()); ++ T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); ++ T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); ++ int* block_tables_ptr = block_tables.data_ptr(); ++ int* seq_lens_ptr = seq_lens.data_ptr(); + + switch (head_size) { +- case 64: +- LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); +- break; +- case 80: +- LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); +- break; +- case 96: +- LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); +- break; +- case 112: +- LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); +- break; +- case 128: +- LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); +- break; +- case 256: +- LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); +- break; +- default: +- TORCH_CHECK(false, "Unsupported head size: ", head_size); +- break; ++ case 32: ++ LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); ++ break; ++ case 64: ++ LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); ++ break; ++ case 80: ++ LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); ++ break; ++ case 96: ++ LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); ++ break; ++ case 112: ++ LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); ++ break; ++ case 128: ++ LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); ++ break; ++ case 192: ++ LAUNCH_V2_ATTENTION_KERNEL(T, 192, BLOCK_SIZE); ++ break; ++ case 256: ++ LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); ++ break; ++ default: ++ TORCH_CHECK(false, "Unsupported head size: ", head_size); ++ break; + } + } + +-#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ +- paged_attention_v2_impl_launcher( \ +- out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ +- num_kv_heads, scale, block_tables, seq_lens, block_size, \ +- max_seq_len, alibi_slopes); +- +-#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ +- switch (block_size) { \ +- case 16: \ +- CALL_V2_KERNEL_LAUNCHER(T, 16); \ +- break; \ +- default: \ +- TORCH_CHECK(false, "Unsupported block size: ", block_size); \ +- break; \ ++#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ ++ paged_attention_v2_impl_launcher( \ ++ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ ++ num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, \ ++ alibi_slopes); ++ ++#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ ++ switch (block_size) { \ ++ case 16: \ ++ CALL_V2_KERNEL_LAUNCHER(T, 16); \ 
++ break; \ ++ default: \ ++ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ ++ break; \ + } +-} // namespace +- +-void paged_attention_v2(torch::Tensor &out, torch::Tensor &exp_sums, +- torch::Tensor &max_logits, torch::Tensor &tmp_out, +- torch::Tensor &query, torch::Tensor &key_cache, +- torch::Tensor &value_cache, int num_kv_heads, +- float scale, torch::Tensor &block_tables, +- torch::Tensor &seq_lens, int block_size, +- int max_seq_len, +- const c10::optional &alibi_slopes, +- const std::string &kv_cache_dtype, float kv_scale) { +- TORCH_CHECK(kv_scale == 1.0f); ++} // namespace ++ ++void paged_attention_v2( ++ torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, ++ torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, ++ torch::Tensor& value_cache, int64_t num_kv_heads, double scale, ++ torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, ++ int64_t max_seq_len, const std::optional& alibi_slopes, ++ const std::string& kv_cache_dtype, double k_scale, double v_scale, ++ const int64_t tp_rank, const int64_t blocksparse_local_blocks, ++ const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, ++ const int64_t blocksparse_head_sliding_step) { ++ TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); ++ TORCH_CHECK(blocksparse_vert_stride <= 1, ++ "CPU backend does not support blocksparse attention yet."); + VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl", + [&] { + CPU_KERNEL_GUARD_IN(paged_attention_v2_impl) + CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); + CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl) + }); +-} ++} +\ No newline at end of file +diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp +index 7849a5d..31d4543 100644 +--- a/csrc/cpu/cache.cpp ++++ b/csrc/cpu/cache.cpp +@@ -5,25 +5,26 @@ + + namespace { + template +-void copy_blocks_cpu_impl( +- std::vector &key_caches, +- std::vector &value_caches, +- const std::vector> mapping_pairs, +- const int element_num_per_block, const int layer_num) { +- const size_t pair_num = mapping_pairs.size(); ++void copy_blocks_cpu_impl(std::vector const& key_caches, ++ std::vector const& value_caches, ++ const torch::Tensor& mapping_pairs, ++ const int element_num_per_block, ++ const int layer_num) { ++ const size_t pair_num = mapping_pairs.size(0); + const size_t block_bytes = sizeof(scalar_t) * element_num_per_block; + #pragma omp parallel for collapse(2) + for (int layer = 0; layer < layer_num; ++layer) { + for (size_t pair = 0; pair < pair_num; ++pair) { +- int64_t source_offset = element_num_per_block * mapping_pairs[pair].first; ++ int64_t source_offset = ++ element_num_per_block * mapping_pairs[pair][0].item(); + int64_t target_offset = +- element_num_per_block * mapping_pairs[pair].second; +- scalar_t *key_cache_ptr = key_caches[layer].data_ptr(); +- scalar_t *source_ptr = key_cache_ptr + source_offset; +- scalar_t *target_ptr = key_cache_ptr + target_offset; ++ element_num_per_block * mapping_pairs[pair][1].item(); ++ scalar_t* key_cache_ptr = key_caches[layer].data_ptr(); ++ scalar_t* source_ptr = key_cache_ptr + source_offset; ++ scalar_t* target_ptr = key_cache_ptr + target_offset; + std::memcpy(target_ptr, source_ptr, block_bytes); + +- scalar_t *value_cache_ptr = value_caches[layer].data_ptr(); ++ scalar_t* value_cache_ptr = value_caches[layer].data_ptr(); + source_ptr = value_cache_ptr + source_offset; + target_ptr = value_cache_ptr + target_offset; + std::memcpy(target_ptr, source_ptr, block_bytes); +@@ -33,9 +34,9 
@@ void copy_blocks_cpu_impl( + + template + void reshape_and_cache_cpu_impl( +- const scalar_t *__restrict__ key, const scalar_t *__restrict__ value, +- scalar_t *__restrict__ key_cache, scalar_t *__restrict__ value_cache, +- const int64_t *__restrict__ slot_mapping, const int num_tokens, ++ const scalar_t* __restrict__ key, const scalar_t* __restrict__ value, ++ scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache, ++ const int64_t* __restrict__ slot_mapping, const int num_tokens, + const int key_stride, const int value_stride, const int num_heads, + const int head_size, const int block_size, const int x) { + const int block_elem_num = num_heads * head_size * block_size; +@@ -48,14 +49,14 @@ void reshape_and_cache_cpu_impl( + int src_key_head_idx = token_idx * key_stride + head_idx * head_size; + int src_value_head_idx = + token_idx * value_stride + head_idx * head_size; +- const scalar_t *src_key_head_ptr = key + src_key_head_idx; +- const scalar_t *src_value_head_ptr = value + src_value_head_idx; ++ const scalar_t* src_key_head_ptr = key + src_key_head_idx; ++ const scalar_t* src_value_head_ptr = value + src_value_head_idx; + const int64_t block_index = slot_idx / block_size; + const int64_t block_offset = slot_idx % block_size; +- scalar_t *target_key_head_ptr = key_cache + ++ scalar_t* target_key_head_ptr = key_cache + + block_elem_num * block_index + + head_idx * block_size * head_size; +- scalar_t *target_value_head_ptr = value_cache + ++ scalar_t* target_value_head_ptr = value_cache + + block_elem_num * block_index + + head_idx * block_size * head_size; + +@@ -79,40 +80,36 @@ void reshape_and_cache_cpu_impl( + } + } + } +-}; // namespace ++}; // namespace + +-void copy_blocks(std::vector &key_caches, +- std::vector &value_caches, +- const std::map> &block_mapping) { +- int num_layers = key_caches.size(); ++// Note: the key_caches and value_caches vectors are constant but ++// not the Tensors they contain. The vectors need to be const refs ++// in order to satisfy pytorch's C++ operator registration code. 
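// Illustrative sketch (the helper name and the int64 dtype are assumptions,
// not part of vLLM's API): the block_mapping tensor that copy_blocks below
// forwards to copy_blocks_cpu_impl is shaped [num_pairs, 2], with row i
// holding {source_block, destination_block}. A caller still holding the old
// map-of-vectors form could flatten it like so:
static torch::Tensor make_block_mapping_sketch(
    const std::map<int64_t, std::vector<int64_t>>& old_mapping) {
  std::vector<int64_t> flat;
  for (const auto& [src, dsts] : old_mapping) {
    for (int64_t dst : dsts) {
      flat.push_back(src);  // column 0: source block number
      flat.push_back(dst);  // column 1: destination block number
    }
  }
  const int64_t num_pairs = static_cast<int64_t>(flat.size()) / 2;
  // clone() so the returned tensor owns its storage once `flat` is destroyed.
  return torch::from_blob(flat.data(), {num_pairs, 2}, torch::kInt64).clone();
}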
++void copy_blocks(std::vector const& key_caches, ++ std::vector const& value_caches, ++ const torch::Tensor& block_mapping) { ++ unsigned num_layers = key_caches.size(); + TORCH_CHECK(num_layers == value_caches.size()); + if (num_layers == 0) { + return; + } + +- std::vector> mapping_pairs; +- mapping_pairs.reserve(block_mapping.size()); +- for (const auto &pair : block_mapping) { +- for (const auto &dst : pair.second) { +- mapping_pairs.emplace_back(pair.first, dst); +- } +- } +- + const int element_num_per_block = key_caches[0][0].numel(); + VLLM_DISPATCH_FLOATING_TYPES( + key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] { + CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl) +- copy_blocks_cpu_impl(key_caches, value_caches, mapping_pairs, ++ copy_blocks_cpu_impl(key_caches, value_caches, block_mapping, + element_num_per_block, num_layers); + CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl) + }); + } + +-void reshape_and_cache(torch::Tensor &key, torch::Tensor &value, +- torch::Tensor &key_cache, torch::Tensor &value_cache, +- torch::Tensor &slot_mapping, +- const std::string &kv_cache_dtype, float kv_scale) { +- TORCH_CHECK(kv_scale == 1.0f); ++void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, ++ torch::Tensor& key_cache, torch::Tensor& value_cache, ++ torch::Tensor& slot_mapping, ++ const std::string& kv_cache_dtype, double k_scale, ++ double v_scale) { ++ TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); + + int num_tokens = key.size(0); + int num_heads = key.size(1); +@@ -135,7 +132,7 @@ void reshape_and_cache(torch::Tensor &key, torch::Tensor &value, + }); + } + +-void swap_blocks(torch::Tensor &src, torch::Tensor &dst, +- const std::map &block_mapping) { ++void swap_blocks(torch::Tensor& src, torch::Tensor& dst, ++ const torch::Tensor& block_mapping) { + TORCH_CHECK(false, "swap_blocks is unsupported on CPU.") + } +diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp +index c1d3ec0..28db047 100644 +--- a/csrc/cpu/cpu_types.hpp ++++ b/csrc/cpu/cpu_types.hpp +@@ -1,352 +1,17 @@ +- + #ifndef CPU_TYPES_HPP + #define CPU_TYPES_HPP + +-#include +-#include +- +-namespace vec_op { +- +-// FIXME: FP16 is not fully supported in Torch-CPU +-#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ +- AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ +- AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) +- +-#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +- AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) +- +-#ifndef CPU_OP_GUARD +-#define CPU_KERNEL_GUARD_IN(NAME) +-#define CPU_KERNEL_GUARD_OUT(NAME) ++#if defined(__x86_64__) ++ //x86 implementation ++ #include "cpu_types_x86.hpp" ++#elif defined(__POWER9_VECTOR__) ++ //ppc implementation ++ #include "cpu_types_vsx.hpp" ++#elif defined(__aarch64__) ++ //arm implementation ++ #include "cpu_types_arm.hpp" + #else +-#define CPU_KERNEL_GUARD_IN(NAME) \ +- std::cout << #NAME << " invoked." << std::endl; +-#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." 
<< std::endl; +-#endif +- +-#define FORCE_INLINE __attribute__((always_inline)) inline +- +-namespace { +-template +-constexpr void unroll_loop_item(std::integer_sequence, F &&f) { +- (f(std::integral_constant{}), ...); +-} +-}; // namespace +- +-template >> +-constexpr void unroll_loop(F &&f) { +- unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +-} +- +-template struct Vec { +- constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +-}; +- +-struct FP32Vec8; +-struct FP32Vec16; +- +-#ifdef __AVX512FP16__ +-struct FP16Vec8 : public Vec { +- constexpr static int VEC_ELEM_NUM = 8; +- +- __m128h reg; +- +- explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} +- +- explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} +- +- explicit FP16Vec8(__m128h data) : reg(data) {} +- +- FP16Vec8 operator*(const FP16Vec8 &b) const { +- return FP16Vec8(_mm_mul_ph(reg, b.reg)); +- } +- +- FP16Vec8 operator+(const FP16Vec8 &b) const { +- return FP16Vec8(_mm_add_ph(reg, b.reg)); +- } +- +- FP16Vec8 operator-(const FP16Vec8 &b) const { +- return FP16Vec8(_mm_sub_ph(reg, b.reg)); +- } +- +- FP16Vec8 operator/(const FP16Vec8 &b) const { +- return FP16Vec8(_mm_div_ph(reg, b.reg)); +- } +- +- void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } +-}; ++ #warning "unsupported vLLM cpu implementation" + #endif + +-struct BF16Vec8 : public Vec { +- constexpr static int VEC_ELEM_NUM = 8; +- +- __m128i reg; +- +- explicit BF16Vec8(const void *ptr) +- : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} +- +- explicit BF16Vec8(const FP32Vec8 &); +- +- void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } +-}; +- +-struct BF16Vec16 : public Vec { +- constexpr static int VEC_ELEM_NUM = 16; +- +- __m256i reg; +- +- explicit BF16Vec16(const void *ptr) +- : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} +- +- explicit BF16Vec16(const FP32Vec16 &); +- +- void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } +-}; +- +-struct BF16Vec32 : public Vec { +- constexpr static int VEC_ELEM_NUM = 32; +- +- __m512i reg; +- +- explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} +- +- explicit BF16Vec32(__m512i data) : reg(data) {} +- +- explicit BF16Vec32(BF16Vec8 &vec8_data) +- : reg((__m512i)_mm512_inserti32x4( +- _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( +- (__m128i)vec8_data.reg), +- (__m128i)vec8_data.reg, 1), +- (__m128i)vec8_data.reg, 2), +- (__m128i)vec8_data.reg, 3)) {} +- +- void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } +-}; +- +-struct FP32Vec4 : public Vec { +- constexpr static int VEC_ELEM_NUM = 4; +- union AliasReg { +- __m128 reg; +- float values[VEC_ELEM_NUM]; +- }; +- +- __m128 reg; +- +- explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {} +- +- explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} +- +- explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} +- +- explicit FP32Vec4(__m128 data) : reg(data) {} +- +- explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} +-}; +- +-struct FP32Vec8 : public Vec { +- constexpr static int VEC_ELEM_NUM = 8; +- union AliasReg { +- __m256 reg; +- float values[VEC_ELEM_NUM]; +- }; +- +- __m256 reg; +- +- explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {} +- +- explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} +- +- explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} +- +- explicit FP32Vec8(__m256 data) : reg(data) {} +- +- explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} +- +-#ifdef __AVX512FP16__ 
+- explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} +-#endif +- +- explicit FP32Vec8(const BF16Vec8 &v) +- : reg(_mm256_castsi256_ps( +- _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} +- +- float reduce_sum() const { +- AliasReg ar; +- ar.reg = reg; +- float result = 0; +- unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); +- +- return result; +- } +- +- FP32Vec8 exp() const { +- AliasReg ar; +- ar.reg = reg; +- return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]), +- expf(ar.values[5]), expf(ar.values[4]), +- expf(ar.values[3]), expf(ar.values[2]), +- expf(ar.values[1]), expf(ar.values[0]))); +- } +- +- FP32Vec8 tanh() const { +- AliasReg ar; +- ar.reg = reg; +- return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]), +- tanhf(ar.values[5]), tanhf(ar.values[4]), +- tanhf(ar.values[3]), tanhf(ar.values[2]), +- tanhf(ar.values[1]), tanhf(ar.values[0]))); +- } +- +- FP32Vec8 er() const { +- AliasReg ar; +- ar.reg = reg; +- return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]), +- erf(ar.values[5]), erf(ar.values[4]), +- erf(ar.values[3]), erf(ar.values[2]), +- erf(ar.values[1]), erf(ar.values[0]))); +- } +- +- FP32Vec8 operator*(const FP32Vec8 &b) const { +- return FP32Vec8(_mm256_mul_ps(reg, b.reg)); +- } +- +- FP32Vec8 operator+(const FP32Vec8 &b) const { +- return FP32Vec8(_mm256_add_ps(reg, b.reg)); +- } +- +- FP32Vec8 operator-(const FP32Vec8 &b) const { +- return FP32Vec8(_mm256_sub_ps(reg, b.reg)); +- } +- +- FP32Vec8 operator/(const FP32Vec8 &b) const { +- return FP32Vec8(_mm256_div_ps(reg, b.reg)); +- } +- +- void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } +-}; +- +-struct FP32Vec16 : public Vec { +- constexpr static int VEC_ELEM_NUM = 16; +- union AliasReg { +- __m512 reg; +- float values[VEC_ELEM_NUM]; +- }; +- +- __m512 reg; +- +- explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {} +- +- explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} +- +- explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} +- +- explicit FP32Vec16(__m512 data) : reg(data) {} +- +- explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} +- +- explicit FP32Vec16(const FP32Vec4 &data) +- : reg((__m512)_mm512_inserti32x4( +- _mm512_inserti32x4( +- _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), +- (__m128i)data.reg, 1), +- (__m128i)data.reg, 2), +- (__m128i)data.reg, 3)) {} +- +- explicit FP32Vec16(const FP32Vec8 &data) +- : reg((__m512)_mm512_inserti32x8( +- _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} +- +- explicit FP32Vec16(const BF16Vec16 &v) +- : reg(_mm512_castsi512_ps( +- _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} +- +- explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} +- +- FP32Vec16 operator*(const FP32Vec16 &b) const { +- return FP32Vec16(_mm512_mul_ps(reg, b.reg)); +- } +- +- FP32Vec16 operator+(const FP32Vec16 &b) const { +- return FP32Vec16(_mm512_add_ps(reg, b.reg)); +- } +- +- FP32Vec16 operator-(const FP32Vec16 &b) const { +- return FP32Vec16(_mm512_sub_ps(reg, b.reg)); +- } +- +- FP32Vec16 operator/(const FP32Vec16 &b) const { +- return FP32Vec16(_mm512_div_ps(reg, b.reg)); +- } +- +- float reduce_sum() const { return _mm512_reduce_add_ps(reg); } +- +- template float reduce_sub_sum(int idx) { +- static_assert(VEC_ELEM_NUM % group_size == 0); +- constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); +- __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); +- return 
_mm512_mask_reduce_add_ps(mask, reg); +- } +- +- void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } +-}; +- +-template struct VecType { using vec_type = void; }; +- +-template using vec_t = typename VecType::vec_type; +- +-template <> struct VecType { using vec_type = FP32Vec8; }; +- +-#ifdef __AVX512FP16__ +-template <> struct VecType { using vec_type = FP16Vec16; }; +-#endif +- +-template <> struct VecType { using vec_type = BF16Vec8; }; +- +-template void storeFP32(float v, T *ptr) { *ptr = v; } +- +-#ifdef __AVX512FP16__ +-template <> inline void storeFP32(float v, c10::Half *ptr) { +- *reinterpret_cast<_Float16 *>(ptr) = v; +-} +-#endif +- +-inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { +- acc = acc + a * b; +-} +- +-#ifdef __AVX512BF16__ +-template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { +- *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); +-} +- +-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +- : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} +- +-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +- : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} +- +-inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { +- acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); +-} +-#else +-template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { +- c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = +- reinterpret_cast(&v); +- *ptr = *(v_ptr + 1); +-} +- +-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +- : reg(_mm256_cvtepi32_epi16( +- _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} +- +-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +- : reg(_mm512_cvtepi32_epi16( +- _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} +-#endif +- +-inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } +- +-}; // namespace vec_op +- +-#endif ++#endif +\ No newline at end of file +diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp +new file mode 100644 +index 0000000..ae062a5 +--- /dev/null ++++ b/csrc/cpu/cpu_types_arm.hpp +@@ -0,0 +1,572 @@ ++#include ++#include ++#include ++ ++namespace vec_op { ++ ++#ifdef ARM_BF16_SUPPORT ++ #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ ++ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) ++#else ++ #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ ++ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) ++#endif ++ ++#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ ++ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) ++ ++#ifndef CPU_OP_GUARD ++#define CPU_KERNEL_GUARD_IN(NAME) ++#define CPU_KERNEL_GUARD_OUT(NAME) ++#else ++#define CPU_KERNEL_GUARD_IN(NAME) \ ++ std::cout << #NAME << " invoked." << std::endl; ++#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." 
<< std::endl; ++#endif ++ ++#define FORCE_INLINE __attribute__((always_inline)) inline ++ ++namespace { ++ template ++ constexpr void unroll_loop_item(std::integer_sequence, F &&f) { ++ (f(std::integral_constant{}), ...); ++ }; ++}; ++ ++template >> ++constexpr void unroll_loop(F &&f) { ++ unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); ++} ++ ++template struct Vec { ++ constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }; ++}; ++ ++struct FP32Vec8; ++struct FP32Vec16; ++ ++struct FP16Vec8 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 8; ++ ++ float16x8_t reg; ++ ++ explicit FP16Vec8(const void *ptr) ++ : reg(vld1q_f16(static_cast(ptr))) {}; ++ ++ explicit FP16Vec8(const FP32Vec8 &); ++ ++ void save(void *ptr) const { ++ vst1q_f16(static_cast<__fp16 *>(ptr), reg); ++ } ++}; ++ ++struct FP16Vec16 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 16; ++ ++ float16x8x2_t reg; ++ ++ explicit FP16Vec16(const void *ptr) { ++ reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); ++ reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); ++ } ++ ++ explicit FP16Vec16(const FP32Vec16& vec); ++ ++ void save(void *ptr) const { ++ vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); ++ vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); ++ } ++ ++ void save(void *ptr, const int elem_num) const { ++ int full_blocks = elem_num / 8; ++ int remainder = elem_num % 8; ++ ++ if (full_blocks > 0) { ++ vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); ++ if (full_blocks > 1) { ++ vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); ++ } ++ } ++ ++ // Note: below is the unrolled version of the following code: ++ // ++ // for (int i = 0; i < remainder; ++i) { ++ // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = ++ // vgetq_lane_f16(temp, i); ++ // } ++ // ++ // For macOS build (Clang), the arm/neon intrinsics function ++ // `vgetq_lane_f16` needs the parameter `i` to be constant at compile ++ // time. 
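    // An equivalent tail store (sketch only, not the version used here)
    // avoids per-lane extraction by spilling the vector to a small stack
    // buffer and copying `remainder` elements:
    //
    //   __fp16 tmp[8];
    //   vst1q_f16(tmp, reg.val[full_blocks]);
    //   std::memcpy(reinterpret_cast<__fp16*>(ptr) + full_blocks * 8, tmp,
    //               remainder * sizeof(__fp16));
    //
    // This sidesteps vgetq_lane_f16's constant-lane requirement at the cost
    // of one extra 16-byte store (and would need <cstring> for std::memcpy).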
++ ++ if (remainder > 0) { ++ float16x8_t temp = reg.val[full_blocks]; ++ __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); ++ switch (remainder) ++ { ++ case 1: ++ fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); ++ break; ++ case 2: ++ fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); ++ fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); ++ break; ++ case 3: ++ fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); ++ fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); ++ fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); ++ break; ++ case 4: ++ fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); ++ fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); ++ fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); ++ fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); ++ break; ++ case 5: ++ fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); ++ fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); ++ fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); ++ fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); ++ fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); ++ break; ++ case 6: ++ fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); ++ fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); ++ fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); ++ fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); ++ fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); ++ fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); ++ break; ++ case 7: ++ fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); ++ fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); ++ fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); ++ fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); ++ fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); ++ fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); ++ fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); ++ break; ++ ++ default: ++ break; ++ } ++ } ++ } ++}; ++ ++ ++#ifdef ARM_BF16_SUPPORT ++struct BF16Vec8 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 8; ++ ++ bfloat16x8_t reg; ++ ++ explicit BF16Vec8(const void *ptr) ++ : reg(*reinterpret_cast(ptr)) {}; ++ ++ explicit BF16Vec8(bfloat16x8_t data) : reg(data) {}; ++ ++ explicit BF16Vec8(const FP32Vec8 &); ++ ++ explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; ++ ++ void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } ++}; ++ ++struct BF16Vec16 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 16; ++ ++ bfloat16x8x2_t reg; ++ ++ explicit BF16Vec16(const void *ptr) ++ : reg(*reinterpret_cast(ptr)) {}; ++ ++ explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; ++ ++ explicit BF16Vec16(const FP32Vec16 &); ++ ++ explicit BF16Vec16(float32x4x4_t v) : reg({ ++ vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), ++ vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3]) ++ }){}; ++ ++ void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; ++}; ++ ++struct BF16Vec32 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 32; ++ ++ bfloat16x8x4_t reg; ++ ++ explicit BF16Vec32(const void *ptr) ++ : reg(*reinterpret_cast(ptr)) {}; ++ ++ explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {}; ++ ++ explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ ++ vec8_data.reg, ++ vec8_data.reg, ++ vec8_data.reg, ++ vec8_data.reg ++ }) {}; ++ ++ void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; ++}; ++#endif ++ ++struct 
FP32Vec4 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 4; ++ ++ union AliasReg { ++ float32x4_t reg; ++ float values[VEC_ELEM_NUM]; ++ }; ++ ++ float32x4_t reg; ++ ++ explicit FP32Vec4(float v) : reg(vdupq_n_f32(v)) {}; ++ ++ explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {}; ++ ++ explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {}; ++ ++ explicit FP32Vec4(float32x4_t data) : reg(data) {}; ++ ++ explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}; ++}; ++ ++struct FP32Vec8 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 8; ++ union AliasReg { ++ float32x4x2_t reg; ++ float values[VEC_ELEM_NUM]; ++ }; ++ ++ float32x4x2_t reg; ++ ++ explicit FP32Vec8(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v)}) {}; ++ ++ explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}; ++ ++ explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; ++ ++ explicit FP32Vec8(float32x4x2_t data) : reg(data) {}; ++ ++ explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}; ++ ++ explicit FP32Vec8(const FP16Vec8 &v) { ++ reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); ++ reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); ++ }; ++ ++ explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; ++ ++ #ifdef ARM_BF16_SUPPORT ++ ++ explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; ++ ++ explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; ++ ++ #endif ++ ++ float reduce_sum() const { ++ AliasReg ar; ++ ar.reg = reg; ++ float answer = 0; ++ unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); ++ ++ return answer; ++ } ++ ++ FP32Vec8 exp() const { ++ AliasReg ar; ++ ar.reg = reg; ++ ++ float32x2_t exp_vec0 = {expf(ar.values[0]), expf(ar.values[1])}; ++ float32x2_t exp_vec1 = {expf(ar.values[2]), expf(ar.values[3])}; ++ float32x2_t exp_vec2 = {expf(ar.values[4]), expf(ar.values[5])}; ++ float32x2_t exp_vec3 = {expf(ar.values[6]), expf(ar.values[7])}; ++ ++ float32x4_t result0 = vcombine_f32(exp_vec0, exp_vec1); ++ float32x4_t result1 = vcombine_f32(exp_vec2, exp_vec3); ++ ++ float32x4x2_t result; ++ result.val[0] = result0; ++ result.val[1] = result1; ++ ++ return FP32Vec8(result); ++ } ++ ++ FP32Vec8 tanh() const { ++ AliasReg ar; ++ ar.reg = reg; ++ ++ float32x2_t tanh_vec0 = {tanhf(ar.values[0]), tanhf(ar.values[1])}; ++ float32x2_t tanh_vec1 = {tanhf(ar.values[2]), tanhf(ar.values[3])}; ++ float32x2_t tanh_vec2 = {tanhf(ar.values[4]), tanhf(ar.values[5])}; ++ float32x2_t tanh_vec3 = {tanhf(ar.values[6]), tanhf(ar.values[7])}; ++ ++ float32x4_t result0 = vcombine_f32(tanh_vec0, tanh_vec1); ++ float32x4_t result1 = vcombine_f32(tanh_vec2, tanh_vec3); ++ ++ float32x4x2_t result; ++ result.val[0] = result0; ++ result.val[1] = result1; ++ ++ return FP32Vec8(result); ++ } ++ ++ FP32Vec8 er() const { ++ AliasReg ar; ++ ar.reg = reg; ++ ++ float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), static_cast(erf(ar.values[1]))}; ++ float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), static_cast(erf(ar.values[3]))}; ++ float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), static_cast(erf(ar.values[5]))}; ++ float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), static_cast(erf(ar.values[7]))}; ++ ++ float32x4_t result0 = vcombine_f32(er_vec0, er_vec1); ++ float32x4_t result1 = vcombine_f32(er_vec2, er_vec3); ++ ++ float32x4x2_t result; ++ result.val[0] = result0; ++ result.val[1] = result1; ++ ++ return 
FP32Vec8(result); ++ } ++ ++ FP32Vec8 operator*(const FP32Vec8 &b) const { ++ return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])})); ++ } ++ ++ FP32Vec8 operator+(const FP32Vec8 &b) const { ++ return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])})); ++ } ++ ++ FP32Vec8 operator-(const FP32Vec8 &b) const { ++ return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])})); ++ } ++ ++ FP32Vec8 operator/(const FP32Vec8 &b) const { ++ return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])})); ++ } ++ ++ void save(float *ptr) const { ++ vst1q_f32(ptr, reg.val[0]); ++ vst1q_f32(ptr + 4, reg.val[1]); ++ } ++}; ++ ++struct FP32Vec16 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 16; ++ union AliasReg { ++ float32x4x4_t reg; ++ float values[VEC_ELEM_NUM]; ++ }; ++ ++ float32x4x4_t reg; ++ ++ explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} ++ ++ explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {} ++ ++ explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {} ++ ++ explicit FP32Vec16(float32x4x4_t data) : reg(data) {} ++ ++ explicit FP32Vec16(const FP32Vec8 &data) { ++ reg.val[0] = data.reg.val[0]; ++ reg.val[1] = data.reg.val[1]; ++ reg.val[2] = data.reg.val[0]; ++ reg.val[3] = data.reg.val[1]; ++ } ++ ++ explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} ++ ++ explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {} ++ ++ #ifdef ARM_BF16_SUPPORT ++ explicit FP32Vec16(bfloat16x8x2_t v) : reg({ ++ vcvtq_low_f32_bf16(v.val[0]), ++ vcvtq_high_f32_bf16(v.val[0]), ++ vcvtq_low_f32_bf16(v.val[1]), ++ vcvtq_high_f32_bf16(v.val[1]) ++ }) {}; ++ #endif ++ ++ explicit FP32Vec16(const FP32Vec4 &data) { ++ reg.val[0] = data.reg; ++ reg.val[1] = data.reg; ++ reg.val[2] = data.reg; ++ reg.val[3] = data.reg; ++ }; ++ ++ #ifdef ARM_BF16_SUPPORT ++ explicit FP32Vec16(const BF16Vec16 &v) : reg({ ++ vcvtq_low_f32_bf16(v.reg.val[0]), ++ vcvtq_high_f32_bf16(v.reg.val[0]), ++ vcvtq_low_f32_bf16(v.reg.val[1]), ++ vcvtq_high_f32_bf16(v.reg.val[1]) ++ }) {}; ++ ++ explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}; ++ #endif ++ ++ explicit FP32Vec16(const FP16Vec16 &v) { ++ reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); ++ reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); ++ reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); ++ reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); ++ }; ++ ++ FP32Vec16 operator+(const FP32Vec16 &b) const { ++ return FP32Vec16(float32x4x4_t({ ++ vaddq_f32(reg.val[0], b.reg.val[0]), ++ vaddq_f32(reg.val[1], b.reg.val[1]), ++ vaddq_f32(reg.val[2], b.reg.val[2]), ++ vaddq_f32(reg.val[3], b.reg.val[3])})); ++ }; ++ ++ FP32Vec16 operator*(const FP32Vec16 &b) const { ++ return FP32Vec16(float32x4x4_t({ ++ vmulq_f32(reg.val[0], b.reg.val[0]), ++ vmulq_f32(reg.val[1], b.reg.val[1]), ++ vmulq_f32(reg.val[2], b.reg.val[2]), ++ vmulq_f32(reg.val[3], b.reg.val[3])})); ++ }; ++ ++ FP32Vec16 operator-(const FP32Vec16 &b) const { ++ return FP32Vec16(float32x4x4_t({ ++ vsubq_f32(reg.val[0], b.reg.val[0]), ++ vsubq_f32(reg.val[1], b.reg.val[1]), ++ vsubq_f32(reg.val[2], b.reg.val[2]), ++ vsubq_f32(reg.val[3], b.reg.val[3]) ++ })); ++ }; ++ ++ FP32Vec16 operator/(const FP32Vec16 
&b) const { ++ return FP32Vec16(float32x4x4_t({ ++ vdivq_f32(reg.val[0], b.reg.val[0]), ++ vdivq_f32(reg.val[1], b.reg.val[1]), ++ vdivq_f32(reg.val[2], b.reg.val[2]), ++ vdivq_f32(reg.val[3], b.reg.val[3]) ++ })); ++ }; ++ ++ float reduce_sum() const { ++ AliasReg ar; ++ ar.reg = reg; ++ float answer = 0; ++ unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); ++ ++ return answer; ++ }; ++ ++ template float reduce_sub_sum(int idx) { ++ static_assert(VEC_ELEM_NUM % group_size == 0); ++ ++ AliasReg ar; ++ ar.reg = reg; ++ float answer = 0; ++ const int start = idx * group_size; ++ unroll_loop( ++ [&answer, &start, ar](int i) { answer += ar.values[start + i]; }); ++ ++ return answer; ++ }; ++ ++ void save(float *ptr) const { ++ vst1q_f32(ptr, reg.val[0]); ++ vst1q_f32(ptr + 4, reg.val[1]); ++ vst1q_f32(ptr + 8, reg.val[2]); ++ vst1q_f32(ptr + 12, reg.val[3]); ++ }; ++}; ++ ++template struct VecType { using vec_type = void; }; ++ ++template using vec_t = typename VecType::vec_type; ++ ++template <> struct VecType { using vec_type = FP32Vec8; }; ++ ++template <> struct VecType { using vec_type = FP16Vec8; }; ++ ++#ifdef ARM_BF16_SUPPORT ++template <> struct VecType { using vec_type = BF16Vec8; }; ++#endif ++ ++template void storeFP32(float v, T *ptr) { *ptr = v; } ++ ++template <> inline void storeFP32(float v, c10::Half *ptr) { ++ *reinterpret_cast<__fp16 *>(ptr) = v; ++} ++ ++inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) { ++ float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); ++ float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); ++ float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); ++ float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); ++ ++ reg.val[0] = vcombine_f16(low_0, high_0); ++ reg.val[1] = vcombine_f16(low_1, high_1); ++}; ++ ++inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) { ++ float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); ++ float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); ++ ++ reg = vcombine_f16(lower_half, upper_half); ++}; ++ ++inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { ++ ++ acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]); ++ acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); ++ acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]); ++ acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]); ++}; ++ ++#ifdef ARM_BF16_SUPPORT ++inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { ++ ++ float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); ++ float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0])); ++ float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1])); ++ float32x4_t a1_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[1])); ++ ++ float32x4_t b0_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[0])); ++ float32x4_t b0_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[0])); ++ float32x4_t b1_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[1])); ++ float32x4_t b1_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[1])); ++ ++ acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a0_low, b0_low); ++ acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a0_high, b0_high); ++ acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a1_low, b1_low); ++ acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a1_high, b1_high); ++}; ++#endif ++ ++#ifdef ARM_BF16_SUPPORT ++inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {}; ++ ++inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({ ++ vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), 
v.reg.val[1]), ++ vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3]) ++ }){}; ++#endif ++ ++inline void prefetch(const void *addr) { ++ __builtin_prefetch(addr, 0, 1); ++}; ++ ++#ifdef ARM_BF16_SUPPORT ++template <> ++inline void storeFP32(float v, c10::BFloat16 *ptr) { ++ *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v); ++}; ++#endif ++}; +\ No newline at end of file +diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp +new file mode 100644 +index 0000000..b50bdad +--- /dev/null ++++ b/csrc/cpu/cpu_types_vsx.hpp +@@ -0,0 +1,491 @@ ++ ++#ifndef CPU_TYPES_VSX_HPP ++#define CPU_TYPES_VSX_HPP ++ ++#include ++#include ++#include ++ ++namespace vec_op { ++ ++// FIXME: FP16 is not fully supported in Torch-CPU ++#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ ++ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) ++ ++#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ ++ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) ++ ++#ifndef CPU_OP_GUARD ++#define CPU_KERNEL_GUARD_IN(NAME) ++#define CPU_KERNEL_GUARD_OUT(NAME) ++#else ++#define CPU_KERNEL_GUARD_IN(NAME) \ ++ std::cout << #NAME << " invoked." << std::endl; ++#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; ++#endif ++ ++#define FORCE_INLINE __attribute__((always_inline)) inline ++ ++namespace { ++template ++constexpr void unroll_loop_item(std::integer_sequence, F &&f) { ++ (f(std::integral_constant{}), ...); ++} ++}; // namespace ++ ++template >> ++constexpr void unroll_loop(F &&f) { ++ unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); ++} ++ ++template struct Vec { ++ constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } ++}; ++ ++typedef struct ss16x8x2_t { ++ __vector signed short val[2]; ++} ss16x8x2_t; ++ ++typedef struct ss16x8x4_t { ++ __vector signed short val[4]; ++} ss16x8x4_t; ++ ++typedef struct f32x4x2_t { ++ __vector float val[2]; ++} f32x4x2_t; ++ ++typedef struct f32x4x4_t { ++ __vector float val[4]; ++} f32x4x4_t; ++ ++struct FP32Vec8; ++struct FP32Vec16; ++ ++struct BF16Vec8 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 8; ++ ++ __vector signed short reg; ++ ++ explicit BF16Vec8(const void *ptr) ++ : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {} ++ ++ explicit BF16Vec8(const FP32Vec8 &); ++ ++ void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; } ++}; ++ ++struct BF16Vec16 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 16; ++ ++ ss16x8x2_t reg; ++ ++ explicit BF16Vec16(const void *ptr) { ++ // Load 256 bits in two parts ++ reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr); ++ reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr); ++ } ++ ++ explicit BF16Vec16(const FP32Vec16 &); ++ ++ void save(void *ptr) const { ++ // Save 256 bits in two parts ++ vec_xst(reg.val[0], 0, (signed short *)ptr); ++ vec_xst(reg.val[1], 16, (signed short *)ptr); ++ } ++}; ++ ++const static __vector signed short zero = vec_splats((signed short)0); ++ ++struct BF16Vec32 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 32; ++ ++ ss16x8x4_t reg; ++ explicit BF16Vec32(const void *ptr) ++ : reg(*reinterpret_cast(ptr)) {} ++ ++ explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} ++ ++ explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ ++ vec8_data.reg, ++ vec8_data.reg, ++ vec8_data.reg, ++ vec8_data.reg ++ }) {} ++ ++ void save(void *ptr) const { 
*reinterpret_cast(ptr) = reg; } ++}; ++ ++struct FP32Vec4 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 4; ++ union AliasReg { ++ __vector float reg; ++ float values[VEC_ELEM_NUM]; ++ }; ++ ++ __vector float reg; ++ ++ explicit FP32Vec4(float v) : reg(vec_splats(v)) {} ++ ++ explicit FP32Vec4() : reg(vec_splats(0.0f)) {} ++ ++ explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {} ++ ++ explicit FP32Vec4(__vector float data) : reg(data) {} ++ ++ explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} ++}; ++ ++struct FP32Vec8 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 8; ++ union AliasReg { ++ f32x4x2_t reg; ++ float values[VEC_ELEM_NUM]; ++ }; ++ ++ f32x4x2_t reg; ++ ++ explicit FP32Vec8(float v) { ++ reg.val[0] = vec_splats(v); ++ reg.val[1] = vec_splats(v); ++ } ++ ++ explicit FP32Vec8() { ++ reg.val[0] = vec_splats(0.0f); ++ reg.val[1] = vec_splats(0.0f); ++ } ++ ++ explicit FP32Vec8(const float *ptr) { ++ reg.val[0] = vec_xl(0, ptr); ++ reg.val[1] = vec_xl(16, ptr); ++ } ++ ++ explicit FP32Vec8(f32x4x2_t data) : reg(data) {} ++ ++ explicit FP32Vec8(const FP32Vec8 &data) { ++ reg.val[0] = data.reg.val[0]; ++ reg.val[1] = data.reg.val[1]; ++ } ++ ++ explicit FP32Vec8(const BF16Vec8 &v) { ++ reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); ++ reg.val[1] = (__vector float)vec_mergel(zero, v.reg); ++ } ++ ++ float reduce_sum() const { ++ AliasReg ar; ++ ar.reg = reg; ++ float result = 0; ++ unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); ++ ++ return result; ++ } ++ ++ FP32Vec8 exp() const { ++ // TODO: Vectorize this ++ AliasReg ar; ++ ar.reg = reg; ++ f32x4x4_t ret; ++ ret.val[0][0] = std::exp(ar.values[0]); ++ ret.val[0][1] = std::exp(ar.values[1]); ++ ret.val[0][2] = std::exp(ar.values[2]); ++ ret.val[0][3] = std::exp(ar.values[3]); ++ ret.val[1][0] = std::exp(ar.values[4]); ++ ret.val[1][1] = std::exp(ar.values[5]); ++ ret.val[1][2] = std::exp(ar.values[6]); ++ ret.val[1][3] = std::exp(ar.values[7]); ++ return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); ++ } ++ ++ FP32Vec8 tanh() const { ++ // TODO: Vectorize this ++ AliasReg ar; ++ ar.reg = reg; ++ f32x4x4_t ret; ++ ret.val[0][0] = std::tanh(ar.values[0]); ++ ret.val[0][1] = std::tanh(ar.values[1]); ++ ret.val[0][2] = std::tanh(ar.values[2]); ++ ret.val[0][3] = std::tanh(ar.values[3]); ++ ret.val[1][0] = std::tanh(ar.values[4]); ++ ret.val[1][1] = std::tanh(ar.values[5]); ++ ret.val[1][2] = std::tanh(ar.values[6]); ++ ret.val[1][3] = std::tanh(ar.values[7]); ++ return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); ++ } ++ ++ FP32Vec8 er() const { ++ // TODO: Vectorize this ++ AliasReg ar; ++ ar.reg = reg; ++ f32x4x4_t ret; ++ ret.val[0][0] = std::erf(ar.values[0]); ++ ret.val[0][1] = std::erf(ar.values[1]); ++ ret.val[0][2] = std::erf(ar.values[2]); ++ ret.val[0][3] = std::erf(ar.values[3]); ++ ret.val[1][0] = std::erf(ar.values[4]); ++ ret.val[1][1] = std::erf(ar.values[5]); ++ ret.val[1][2] = std::erf(ar.values[6]); ++ ret.val[1][3] = std::erf(ar.values[7]); ++ return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); ++ } ++ ++ FP32Vec8 operator*(const FP32Vec8 &b) const { ++ return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); ++ } ++ ++ FP32Vec8 operator+(const FP32Vec8 &b) const { ++ return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); ++ } ++ ++ FP32Vec8 operator-(const FP32Vec8 &b) const { ++ return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); ++ } ++ ++ FP32Vec8 
operator/(const FP32Vec8 &b) const { ++ return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); ++ } ++ ++ void save(float *ptr) const { ++ vec_xst(reg.val[0], 0, ptr); ++ vec_xst(reg.val[1], 16, ptr); ++ } ++}; ++ ++struct FP32Vec16 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 16; ++ union AliasReg { ++ f32x4x4_t reg; ++ float values[VEC_ELEM_NUM]; ++ }; ++ ++ f32x4x4_t reg; ++ ++ explicit FP32Vec16(float v) { ++ reg.val[0] = vec_splats(v); ++ reg.val[1] = vec_splats(v); ++ reg.val[2] = vec_splats(v); ++ reg.val[3] = vec_splats(v); ++ } ++ ++ explicit FP32Vec16() { ++ reg.val[0] = vec_splats(0.0f); ++ reg.val[1] = vec_splats(0.0f); ++ reg.val[2] = vec_splats(0.0f); ++ reg.val[3] = vec_splats(0.0f); ++ } ++ ++ explicit FP32Vec16(const float *ptr) { ++ reg.val[0] = vec_xl(0, ptr); ++ reg.val[1] = vec_xl(16, ptr); ++ reg.val[2] = vec_xl(32, ptr); ++ reg.val[3] = vec_xl(48, ptr); ++ } ++ ++ explicit FP32Vec16(f32x4x4_t data) : reg(data) {} ++ ++ explicit FP32Vec16(const FP32Vec16 &data) { ++ reg.val[0] = data.reg.val[0]; ++ reg.val[1] = data.reg.val[1]; ++ reg.val[2] = data.reg.val[2]; ++ reg.val[3] = data.reg.val[3]; ++ } ++ ++ explicit FP32Vec16(const FP32Vec4 &data) { ++ reg.val[0] = data.reg; ++ reg.val[1] = data.reg; ++ reg.val[2] = data.reg; ++ reg.val[3] = data.reg; ++ } ++ ++ explicit FP32Vec16(const FP32Vec8 &data) { ++ reg.val[0] = data.reg.val[0]; ++ reg.val[1] = data.reg.val[1]; ++ reg.val[2] = data.reg.val[0]; ++ reg.val[3] = data.reg.val[1]; ++ } ++ ++ explicit FP32Vec16(const BF16Vec16 &v) { ++ reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); ++ reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); ++ reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); ++ reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); ++ } ++ ++ explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} ++ ++ FP32Vec16 operator*(const FP32Vec16 &b) const { ++ return FP32Vec16(f32x4x4_t({ ++ vec_mul(reg.val[0], b.reg.val[0]), ++ vec_mul(reg.val[1], b.reg.val[1]), ++ vec_mul(reg.val[2], b.reg.val[2]), ++ vec_mul(reg.val[3], b.reg.val[3])})); ++ } ++ ++ FP32Vec16 operator+(const FP32Vec16 &b) const { ++ return FP32Vec16(f32x4x4_t({ ++ vec_add(reg.val[0], b.reg.val[0]), ++ vec_add(reg.val[1], b.reg.val[1]), ++ vec_add(reg.val[2], b.reg.val[2]), ++ vec_add(reg.val[3], b.reg.val[3])})); ++ } ++ ++ FP32Vec16 operator-(const FP32Vec16 &b) const { ++ return FP32Vec16(f32x4x4_t({ ++ vec_sub(reg.val[0], b.reg.val[0]), ++ vec_sub(reg.val[1], b.reg.val[1]), ++ vec_sub(reg.val[2], b.reg.val[2]), ++ vec_sub(reg.val[3], b.reg.val[3])})); ++ } ++ ++ FP32Vec16 operator/(const FP32Vec16 &b) const { ++ return FP32Vec16(f32x4x4_t({ ++ vec_div(reg.val[0], b.reg.val[0]), ++ vec_div(reg.val[1], b.reg.val[1]), ++ vec_div(reg.val[2], b.reg.val[2]), ++ vec_div(reg.val[3], b.reg.val[3])})); ++ } ++ ++ float reduce_sum() const { ++ AliasReg ar; ++ ar.reg = reg; ++ float result = 0; ++ unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); ++ ++ return result; ++ } ++ ++ template float reduce_sub_sum(int idx) { ++ static_assert(VEC_ELEM_NUM % group_size == 0); ++ ++ AliasReg ar; ++ ar.reg = reg; ++ float result = 0; ++ const int start = idx * group_size; ++ unroll_loop( ++ [&result, &start, ar](int i) { result += ar.values[start + i]; }); ++ ++ return result; ++ } ++ ++ void save(float *ptr) const { ++ vec_xst(reg.val[0], 0, ptr); ++ vec_xst(reg.val[1], 16, ptr); ++ vec_xst(reg.val[2], 32, ptr); ++ vec_xst(reg.val[3], 48, ptr); ++ } ++}; ++ 
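// Usage sketch (illustrative only; the function name is made up for this
// example): the CPU kernels in this patch consume these wrappers generically,
// e.g. a blocked dot product over float buffers whose length is a multiple of
// FP32Vec16::VEC_ELEM_NUM. Real kernels use the free fma() helper defined
// just below rather than spelling out acc = acc + a * b.
inline float dot_f32_sketch(const float* a, const float* b, int n) {
  FP32Vec16 acc(0.0f);
  for (int i = 0; i < n; i += FP32Vec16::VEC_ELEM_NUM) {
    FP32Vec16 va(a + i);      // four vec_xl loads per operand
    FP32Vec16 vb(b + i);
    acc = acc + va * vb;      // element-wise multiply-accumulate
  }
  return acc.reduce_sum();    // horizontal sum of the 16 lanes
}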
++template struct VecType { using vec_type = void; }; ++ ++template using vec_t = typename VecType::vec_type; ++ ++template <> struct VecType { using vec_type = FP32Vec8; }; ++ ++template <> struct VecType { using vec_type = BF16Vec8; }; ++ ++template void storeFP32(float v, T *ptr) { *ptr = v; } ++ ++inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { ++ acc = acc + a * b; ++} ++ ++template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { ++ c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = ++ reinterpret_cast(&v); ++ *ptr = *(v_ptr + 1); ++} ++ ++#ifndef __VEC_CLASS_FP_NAN ++#define __VEC_CLASS_FP_NAN (1 << 6) ++#endif ++ ++const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; ++#ifndef _ARCH_PWR10 ++const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff }; ++const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; ++const static __vector unsigned int sh16 = { 16, 16, 16, 16 }; ++const static __vector unsigned int one = { 1, 1, 1, 1 }; ++#endif ++ ++inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { ++#ifdef _ARCH_PWR10 ++ __vector signed short ret[2]; ++ ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); ++ ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); ++ reg = vec_perm(ret[0], ret[1], omask); ++#elif defined(_ARCH_PWR9) ++ __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); ++ __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); ++ __vector unsigned int lsb0 = vec_sr(inp0, sh16); ++ __vector unsigned int lsb1 = vec_sr(inp1, sh16); ++ lsb0 = vec_and(lsb0, one); ++ lsb1 = vec_and(lsb1, one); ++ __vector unsigned int rnd0 = vec_add(lsb0, bias); ++ __vector unsigned int rnd1 = vec_add(lsb1, bias); ++ inp0 = vec_add(inp0, rnd0); ++ inp1 = vec_add(inp1, rnd1); ++ __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); ++ __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); ++ inp0 = vec_sel(inp0, nan, sel0); ++ inp1 = vec_sel(inp1, nan, sel1); ++ inp0 = vec_sr(inp0, sh16); ++ inp1 = vec_sr(inp1, sh16); ++ reg = (__vector signed short)vec_perm(inp0, inp1, omask); ++#endif ++} ++ ++inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { ++#ifdef _ARCH_PWR10 ++ __vector signed short ret[4]; ++ ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); ++ ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); ++ ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]); ++ ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]); ++ reg.val[0] = vec_perm(ret[0], ret[1], omask); ++ reg.val[1] = vec_perm(ret[2], ret[3], omask); ++#elif defined(_ARCH_PWR9) ++ __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); ++ __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); ++ __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]); ++ __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]); ++ __vector unsigned int lsb0 = vec_sr(inp0, sh16); ++ __vector unsigned int lsb1 = vec_sr(inp1, sh16); ++ __vector unsigned int lsb2 = vec_sr(inp2, sh16); ++ __vector unsigned int lsb3 = vec_sr(inp3, sh16); ++ lsb0 = vec_and(lsb0, one); ++ lsb1 = vec_and(lsb1, one); ++ lsb2 = vec_and(lsb2, one); ++ lsb3 = 
vec_and(lsb3, one); ++ __vector unsigned int rnd0 = vec_add(lsb0, bias); ++ __vector unsigned int rnd1 = vec_add(lsb1, bias); ++ __vector unsigned int rnd2 = vec_add(lsb2, bias); ++ __vector unsigned int rnd3 = vec_add(lsb3, bias); ++ inp0 = vec_add(inp0, rnd0); ++ inp1 = vec_add(inp1, rnd1); ++ inp2 = vec_add(inp2, rnd2); ++ inp3 = vec_add(inp3, rnd3); ++ __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); ++ __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); ++ __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); ++ __vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); ++ inp0 = vec_sel(inp0, nan, sel0); ++ inp1 = vec_sel(inp1, nan, sel1); ++ inp2 = vec_sel(inp2, nan, sel2); ++ inp3 = vec_sel(inp3, nan, sel3); ++ inp0 = vec_sr(inp0, sh16); ++ inp1 = vec_sr(inp1, sh16); ++ inp2 = vec_sr(inp2, sh16); ++ inp3 = vec_sr(inp3, sh16); ++ reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask); ++ reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask); ++#endif ++} ++ ++inline void prefetch(const void *addr) { ++ __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); ++} ++ ++}; // namespace vec_op ++ ++#endif +diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp +new file mode 100644 +index 0000000..4bb4eb0 +--- /dev/null ++++ b/csrc/cpu/cpu_types_x86.hpp +@@ -0,0 +1,632 @@ ++ ++#ifndef CPU_TYPES_X86_HPP ++#define CPU_TYPES_X86_HPP ++ ++#include ++#include ++ ++#ifndef __AVX2__ ++static_assert(false, "AVX2 must be supported for the current implementation."); ++#endif ++ ++namespace vec_op { ++ ++#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ ++ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) ++ ++#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ ++ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) ++ ++#ifndef CPU_OP_GUARD ++#define CPU_KERNEL_GUARD_IN(NAME) ++#define CPU_KERNEL_GUARD_OUT(NAME) ++#else ++#define CPU_KERNEL_GUARD_IN(NAME) \ ++ RECORD_FUNCTION(#NAME, c10::ArrayRef({})); ++#define CPU_KERNEL_GUARD_OUT(NAME) ++#endif ++ ++#define FORCE_INLINE __attribute__((always_inline)) inline ++ ++namespace { ++template ++constexpr void unroll_loop_item(std::integer_sequence, F &&f) { ++ (f(std::integral_constant{}), ...); ++} ++}; // namespace ++ ++template >> ++constexpr void unroll_loop(F &&f) { ++ unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); ++} ++ ++template struct Vec { ++ constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } ++}; ++ ++struct FP32Vec8; ++struct FP32Vec16; ++ ++struct FP16Vec8 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 8; ++ ++ __m128i reg; ++ ++ explicit FP16Vec8(const void *ptr) ++ : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} ++ ++ explicit FP16Vec8(const FP32Vec8 &); ++ ++ void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } ++}; ++ ++struct FP16Vec16 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 16; ++ ++ __m256i reg; ++ ++ explicit FP16Vec16(const void *ptr) ++ : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} ++ ++ explicit FP16Vec16(const FP32Vec16 &); ++ ++ void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } ++ ++ void save(void* ptr, const int elem_num) const { ++ constexpr uint32_t M = 0xFFFFFFFF; ++ __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); ++ _mm256_mask_storeu_epi16(ptr, mask, reg); ++ } ++}; ++ ++struct BF16Vec8 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 8; ++ ++ __m128i reg; ++ ++ explicit BF16Vec8(const void *ptr) ++ : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} ++ ++ explicit BF16Vec8(const FP32Vec8 &); ++ ++ void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } ++}; ++ ++struct BF16Vec16 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 16; ++ ++ __m256i reg; ++ ++ explicit BF16Vec16(const void *ptr) ++ : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} ++ ++ explicit BF16Vec16(const FP32Vec16 &); ++ ++ void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } ++ ++ void save(void* ptr, const int elem_num) const { ++ constexpr uint32_t M = 0xFFFFFFFF; ++ __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); ++ _mm256_mask_storeu_epi16(ptr, mask, reg); ++ } ++}; ++ ++#ifdef __AVX512F__ ++struct BF16Vec32 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 32; ++ ++ __m512i reg; ++ ++ explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} ++ ++ explicit BF16Vec32(__m512i data) : reg(data) {} ++ ++ explicit BF16Vec32(BF16Vec8 &vec8_data) ++ : reg((__m512i)_mm512_inserti32x4( ++ _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( ++ (__m128i)vec8_data.reg), ++ (__m128i)vec8_data.reg, 1), ++ (__m128i)vec8_data.reg, 2), ++ (__m128i)vec8_data.reg, 3)) {} ++ ++ void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } ++}; ++#else ++struct BF16Vec32 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 32; ++ ++ __m256i reg_low; ++ __m256i reg_high; ++ ++ explicit BF16Vec32(const void *ptr) ++ : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), ++ reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} ++ ++ explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), ++ reg_high(high) {} ++ ++ explicit BF16Vec32(BF16Vec8 &vec8_data) ++ : 
reg_low((__m256i)_mm256_inserti32x4( ++ _mm256_castsi128_si256((__m128i)vec8_data.reg), ++ (__m128i)vec8_data.reg, 1)), ++ reg_high((__m256i)_mm256_inserti32x4( ++ _mm256_castsi128_si256((__m128i)vec8_data.reg), ++ (__m128i)vec8_data.reg, 1)) {} ++ ++ void save(void *ptr) const { ++ *reinterpret_cast<__m256i *>(ptr) = reg_low; ++ *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; ++ } ++}; ++#endif ++ ++struct FP32Vec4 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 4; ++ union AliasReg { ++ __m128 reg; ++ float values[VEC_ELEM_NUM]; ++ }; ++ ++ __m128 reg; ++ ++ explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {} ++ ++ explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} ++ ++ explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} ++ ++ explicit FP32Vec4(__m128 data) : reg(data) {} ++ ++ explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} ++}; ++ ++struct FP32Vec8 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 8; ++ union AliasReg { ++ __m256 reg; ++ float values[VEC_ELEM_NUM]; ++ }; ++ ++ __m256 reg; ++ ++ explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {} ++ ++ explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} ++ ++ explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} ++ ++ explicit FP32Vec8(__m256 data) : reg(data) {} ++ ++ explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} ++ ++ explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {} ++ ++ explicit FP32Vec8(const BF16Vec8 &v) ++ : reg(_mm256_castsi256_ps( ++ _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} ++ ++ float reduce_sum() const { ++ AliasReg ar; ++ ar.reg = reg; ++ float result = 0; ++ unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); ++ ++ return result; ++ } ++ ++ FP32Vec8 exp() const { ++ AliasReg ar; ++ ar.reg = reg; ++ return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]), ++ expf(ar.values[5]), expf(ar.values[4]), ++ expf(ar.values[3]), expf(ar.values[2]), ++ expf(ar.values[1]), expf(ar.values[0]))); ++ } ++ ++ FP32Vec8 tanh() const { ++ AliasReg ar; ++ ar.reg = reg; ++ return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]), ++ tanhf(ar.values[5]), tanhf(ar.values[4]), ++ tanhf(ar.values[3]), tanhf(ar.values[2]), ++ tanhf(ar.values[1]), tanhf(ar.values[0]))); ++ } ++ ++ FP32Vec8 er() const { ++ AliasReg ar; ++ ar.reg = reg; ++ return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]), ++ erf(ar.values[5]), erf(ar.values[4]), ++ erf(ar.values[3]), erf(ar.values[2]), ++ erf(ar.values[1]), erf(ar.values[0]))); ++ } ++ ++ FP32Vec8 operator*(const FP32Vec8 &b) const { ++ return FP32Vec8(_mm256_mul_ps(reg, b.reg)); ++ } ++ ++ FP32Vec8 operator+(const FP32Vec8 &b) const { ++ return FP32Vec8(_mm256_add_ps(reg, b.reg)); ++ } ++ ++ FP32Vec8 operator-(const FP32Vec8 &b) const { ++ return FP32Vec8(_mm256_sub_ps(reg, b.reg)); ++ } ++ ++ FP32Vec8 operator/(const FP32Vec8 &b) const { ++ return FP32Vec8(_mm256_div_ps(reg, b.reg)); ++ } ++ ++ void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } ++}; ++ ++#ifdef __AVX512F__ ++struct INT32Vec16: public Vec { ++ constexpr static int VEC_ELEM_NUM = 16; ++ union AliasReg { ++ __m512i reg; ++ int32_t values[VEC_ELEM_NUM]; ++ }; ++ ++ __m512i reg; ++ ++ explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {} ++ ++ void save(int32_t* ptr) const { ++ _mm512_storeu_epi32(ptr, reg); ++ } ++ ++ void save(int32_t* ptr, const int elem_num) const { ++ constexpr uint32_t M = 0xFFFFFFFF; ++ __mmask16 mask = _cvtu32_mask16(M >> (32 - 
elem_num)); ++ _mm512_mask_storeu_epi32(ptr, mask, reg); ++ } ++}; ++#endif ++ ++#ifdef __AVX512F__ ++struct FP32Vec16 : public Vec { ++ constexpr static int VEC_ELEM_NUM = 16; ++ union AliasReg { ++ __m512 reg; ++ float values[VEC_ELEM_NUM]; ++ }; ++ ++ __m512 reg; ++ ++ explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {} ++ ++ explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} ++ ++ explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} ++ ++ explicit FP32Vec16(__m512 data) : reg(data) {} ++ ++ explicit FP32Vec16(const FP32Vec4 &data) ++ : reg((__m512)_mm512_inserti32x4( ++ _mm512_inserti32x4( ++ _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), ++ (__m128i)data.reg, 1), ++ (__m128i)data.reg, 2), ++ (__m128i)data.reg, 3)) {} ++ ++ explicit FP32Vec16(const FP32Vec8 &data) ++ : reg((__m512)_mm512_inserti32x8( ++ _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} ++ ++ explicit FP32Vec16(const BF16Vec16 &v) ++ : reg(_mm512_castsi512_ps( ++ _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} ++ ++ explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {} ++ ++ explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} ++ ++ explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} ++ ++ explicit FP32Vec16(const INT32Vec16 &v) ++ : reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)) {} ++ ++ FP32Vec16 operator*(const FP32Vec16 &b) const { ++ return FP32Vec16(_mm512_mul_ps(reg, b.reg)); ++ } ++ ++ FP32Vec16 operator+(const FP32Vec16 &b) const { ++ return FP32Vec16(_mm512_add_ps(reg, b.reg)); ++ } ++ ++ FP32Vec16 operator-(const FP32Vec16 &b) const { ++ return FP32Vec16(_mm512_sub_ps(reg, b.reg)); ++ } ++ ++ FP32Vec16 operator/(const FP32Vec16 &b) const { ++ return FP32Vec16(_mm512_div_ps(reg, b.reg)); ++ } ++ ++ FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const { ++ return FP32Vec16(_mm512_min_ps(max.reg, _mm512_max_ps(min.reg, reg))); ++ } ++ ++ FP32Vec16 max(const FP32Vec16& b) const { ++ return FP32Vec16(_mm512_max_ps(reg, b.reg)); ++ } ++ ++ FP32Vec16 max(const FP32Vec16& b, const int elem_num) const { ++ constexpr uint32_t M = 0xFFFFFFFF; ++ __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); ++ return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg)); ++ } ++ ++ FP32Vec16 min(const FP32Vec16& b) const { ++ return FP32Vec16(_mm512_min_ps(reg, b.reg)); ++ } ++ ++ FP32Vec16 min(const FP32Vec16& b, const int elem_num) const { ++ constexpr uint32_t M = 0xFFFFFFFF; ++ __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); ++ return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg)); ++ } ++ ++ FP32Vec16 abs() const { ++ return FP32Vec16(_mm512_abs_ps(reg)); ++ } ++ ++ float reduce_sum() const { return _mm512_reduce_add_ps(reg); } ++ ++ float reduce_max() const { return _mm512_reduce_max_ps(reg); } ++ ++ float reduce_min() const { return _mm512_reduce_min_ps(reg); } ++ ++ template float reduce_sub_sum(int idx) { ++ static_assert(VEC_ELEM_NUM % group_size == 0); ++ constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); ++ __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); ++ return _mm512_mask_reduce_add_ps(mask, reg); ++ } ++ ++ void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } ++ ++ void save(float* ptr, const int elem_num) const { ++ constexpr uint32_t M = 0xFFFFFFFF; ++ __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); ++ _mm512_mask_storeu_ps(ptr, mask, reg); ++ } ++}; ++#else ++struct FP32Vec16 : public Vec { ++ 
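// Fallback used when AVX512 is unavailable: the 16 fp32 lanes live in two
++ // __m256 halves (reg_low / reg_high) and every operator below is simply
++ // applied to each half; reduce_sub_sum() walks a 16-bit lane mask across
++ // both halves instead of using an AVX512 k-mask.
++ 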
constexpr static int VEC_ELEM_NUM = 16; ++ ++ union AliasReg { ++ __m256 reg; ++ float values[8]; ++ }; ++ ++ __m256 reg_low; ++ __m256 reg_high; ++ ++ explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), ++ reg_high(_mm256_set1_ps(v)) {} ++ ++ explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), ++ reg_high(_mm256_set1_ps(0.0)) {} ++ ++ explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), ++ reg_high(_mm256_loadu_ps(ptr + 8)) {} ++ ++ explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} ++ ++ explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), ++ reg_high(data.reg_high) {} ++ ++ explicit FP32Vec16(const FP32Vec4 &data) ++ : reg_low((__m256)_mm256_inserti128_si256( ++ _mm256_castsi128_si256((__m128i)data.reg), ++ (__m128i)data.reg, 1)), ++ reg_high((__m256)_mm256_inserti128_si256( ++ _mm256_castsi128_si256((__m128i)data.reg), ++ (__m128i)data.reg, 1)) {} ++ ++ explicit FP32Vec16(const FP32Vec8 &data) ++ : reg_low(data.reg), reg_high(data.reg) {} ++ ++ explicit FP32Vec16(const FP16Vec16 &v) { ++ __m128i low = _mm256_extractf128_si256(v.reg, 0); ++ __m128i high = _mm256_extractf128_si256(v.reg, 1); ++ ++ reg_low = _mm256_cvtph_ps(low); ++ reg_high = _mm256_cvtph_ps(high); ++ } ++ ++ explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} ++ ++ explicit FP32Vec16(const BF16Vec16 &v) { ++ __m128i low = _mm256_extractf128_si256(v.reg, 0); ++ __m128i high = _mm256_extractf128_si256(v.reg, 1); ++ ++ __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low); ++ __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high); ++ ++ __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2); ++ __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2); ++ ++ reg_low = _mm256_castsi256_ps(v_low_shifted); ++ reg_high = _mm256_castsi256_ps(v_high_shifted); ++ } ++ ++ explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} ++ ++ FP32Vec16 operator*(const FP32Vec16 &b) const { ++ return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), ++ _mm256_mul_ps(reg_high, b.reg_high)); ++ } ++ ++ FP32Vec16 operator+(const FP32Vec16 &b) const { ++ return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), ++ _mm256_add_ps(reg_high, b.reg_high)); ++ } ++ ++ FP32Vec16 operator-(const FP32Vec16 &b) const { ++ return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), ++ _mm256_sub_ps(reg_high, b.reg_high)); ++ } ++ ++ FP32Vec16 operator/(const FP32Vec16 &b) const { ++ return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), ++ _mm256_div_ps(reg_high, b.reg_high)); ++ } ++ ++ float reduce_sum() const { ++ FP32Vec8 low = FP32Vec8(reg_low); ++ FP32Vec8 high = FP32Vec8(reg_high); ++ return low.reduce_sum() + high.reduce_sum(); ++ } ++ ++ template float reduce_sub_sum(int idx) { ++ float sum = 0.0; ++ static_assert(VEC_ELEM_NUM % group_size == 0); ++ constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); ++ uint32_t mask = base_mask << (idx * group_size); ++ ++ AliasReg ar; ++ ++ auto func = [&sum, &mask, &ar](int i) { ++ int flag = mask & 0x1; ++ mask = mask >> 1; ++ if (flag != 0) sum += ar.values[i]; ++ }; ++ ++ ar.reg = reg_low; ++ unroll_loop(func); ++ ++ ar.reg = reg_high; ++ unroll_loop(func); ++ ++ return sum; ++ } ++ ++ void save(float *ptr) const { ++ _mm256_storeu_ps(ptr, reg_low); ++ _mm256_storeu_ps(ptr + 8, reg_high); ++ } ++}; ++#endif ++ ++#ifdef __AVX512F__ ++struct INT8Vec16: public Vec { ++ constexpr static int VEC_ELEM_NUM = 16; ++ union AliasReg { ++ __m128i reg; ++ int8_t values[VEC_ELEM_NUM]; ++ }; ++ ++ __m128i reg; ++ ++ explicit INT8Vec16(const 
FP32Vec16& vec) : reg( ++ _mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) ++ ) {} ++ ++ void save(int8_t* ptr) const { ++ _mm_storeu_epi8(ptr, reg); ++ } ++ ++ void save(int8_t* ptr, const int elem_num) const { ++ constexpr uint32_t M = 0xFFFFFFFF; ++ __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); ++ _mm_mask_storeu_epi8(ptr, mask, reg); ++ } ++}; ++#endif ++ ++template struct VecType { using vec_type = void; }; ++ ++template using vec_t = typename VecType::vec_type; ++ ++template <> struct VecType { using vec_type = FP32Vec8; }; ++ ++template <> struct VecType { using vec_type = FP16Vec8; }; ++ ++template <> struct VecType { using vec_type = BF16Vec8; }; ++ ++template void storeFP32(float v, T *ptr) { *ptr = v; } ++ ++inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { ++ acc = acc + a * b; ++} ++ ++template <> inline void storeFP32(float v, c10::Half *ptr) { ++ *reinterpret_cast(ptr) = ++ _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); ++} ++ ++inline FP16Vec8::FP16Vec8(const FP32Vec8 &v) ++ : reg(_mm256_cvtps_ph(v.reg, ++ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} ++ ++#ifdef __AVX512F__ ++inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) ++ : reg(_mm512_cvtps_ph(v.reg, ++ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} ++#else ++inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) ++ : reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} ++#endif ++ ++#ifdef __AVX512BF16__ ++template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { ++ *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); ++} ++ ++inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) ++ : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} ++ ++inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) ++ : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} ++ ++inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { ++ acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); ++} ++#else ++template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { ++ c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = ++ reinterpret_cast(&v); ++ *ptr = *(v_ptr + 1); ++} ++ ++#ifdef __AVX512F__ ++inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) ++ : reg(_mm256_cvtepi32_epi16( ++ _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} ++ ++inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) ++ : reg(_mm512_cvtepi32_epi16( ++ _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} ++#else ++namespace{ ++__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { ++ __m256i ai = _mm256_castps_si256(a); ++ ai = _mm256_srli_epi32(ai, 16); ++ ai = _mm256_packus_epi32(ai, ai); ++ ai = _mm256_permute4x64_epi64(ai, 0b00111001); ++ return _mm256_extracti128_si256(ai, 0); ++} ++} ++ ++inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) ++ : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} ++ ++inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { ++ BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low)); ++ BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); ++ reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); ++} ++#endif // __AVX512F__ ++#endif // __AVX512BF16__ ++ ++inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } ++ ++}; // namespace vec_op ++ ++#endif +diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp +new file mode 100644 +index 0000000..8b5011d +--- /dev/null ++++ b/csrc/cpu/dnnl_helper.hpp +@@ -0,0 +1,174 @@ ++#ifndef DNNL_HELPER_HPP ++#define DNNL_HELPER_HPP 
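++
++// Usage sketch for the helper defined below (illustrative only; the exact
++// template arguments -- the InputNoScale flag and the output/bias types --
++// are assumptions here and should follow the call sites in csrc/cpu/quant.cpp):
++//
++//   float a_scale = 0.02f, b_scale = 0.01f;  // example per-tensor scales
++//   DNNLPrimitiveHelper<false>::gemm_s8s8_jit<float, float>(
++//       A, B, C, /*bias=*/nullptr, M, N, K,
++//       &a_scale, &b_scale, /*MS=*/1, /*NS=*/1);
++//
++// with A an int8 [M, K] row-major matrix, B an int8 [K, N] column-major
++// matrix and C a float [M, N] row-major result, matching the layout
++// comments on gemm_s8s8_jit.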
++ ++#include ++#include ++ ++#include "oneapi/dnnl/dnnl.hpp" ++ ++namespace { ++template ++struct DNNLType { ++ static constexpr dnnl::memory::data_type type = ++ dnnl::memory::data_type::undef; ++}; ++ ++template <> ++struct DNNLType { ++ static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8; ++}; ++ ++template <> ++struct DNNLType { ++ static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32; ++}; ++ ++template <> ++struct DNNLType { ++ static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32; ++}; ++ ++template <> ++struct DNNLType { ++ static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; ++}; ++ ++template <> ++struct DNNLType { ++ static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; ++}; ++ ++template ++constexpr inline dnnl::memory::data_type get_dnnl_type() { ++ return DNNLType>::type; ++} ++}; // namespace ++ ++template ++class DNNLPrimitiveHelper { ++ public: ++ // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias) ++ // A: [M, K], row-major ++ // B: [K, N], column-major ++ // C: [M, N], row-major ++ // bias: [N], row-major, optional ++ // a_scales: [MS] ++ // b_scales: [NS] ++ // Note: Due to the limitation of oneDNN ++ // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is ++ // not supported. ++ template ++ static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c, ++ const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N, ++ dnnl_dim_t K, const float* a_scales, ++ const float* b_scales, dnnl_dim_t MS, ++ dnnl_dim_t NS) { ++ auto&& OutputType = get_dnnl_type(); ++ auto&& BiasType = get_dnnl_type(); ++ ++ dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1}); ++ dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K}); ++ dnnl::memory::desc c_md({M, N}, OutputType, {N, 1}); ++ ++ dnnl::primitive_attr attr; ++ if constexpr (!InputNoScale) { ++ if (MS == 1) { ++ // per-tensor ++ attr.set_scales_mask(DNNL_ARG_SRC, 0); ++ } else { ++ // per-token ++ TORCH_CHECK(false, "per-token quantization is unsupported."); ++ } ++ } ++ ++ if (NS == 1) { ++ // per-tensor ++ attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); ++ } else { ++ // per-channel ++ attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2); ++ } ++ ++ dnnl::matmul::primitive_desc matmul_pd; ++ if (bias) { ++ dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1}); ++ matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, ++ bias_md, c_md, attr); ++ } else { ++ matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, ++ c_md, attr); ++ } ++ dnnl::matmul matmul(matmul_pd); ++ ++ auto& engine = default_engine(); ++ ++ dnnl::memory a_m(a_md, engine, (void*)a); ++ dnnl::memory b_m(b_md, engine, (void*)b); ++ dnnl::memory c_m(c_md, engine, (void*)c); ++ dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine, ++ (void*)a_scales); ++ dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine, ++ (void*)b_scales); ++ ++ auto& stream = default_stream(); ++ if constexpr (InputNoScale) { ++ if (bias) { ++ dnnl::memory::desc bias_md({N}, BiasType, {1}); ++ dnnl::memory bias_m(bias_md, engine, (void*)bias); ++ matmul.execute( ++ stream, { ++ {DNNL_ARG_SRC, a_m}, ++ {DNNL_ARG_WEIGHTS, b_m}, ++ {DNNL_ARG_BIAS, bias_m}, ++ {DNNL_ARG_DST, c_m}, ++ {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, ++ }); ++ } else { ++ matmul.execute( ++ stream, { ++ {DNNL_ARG_SRC, a_m}, ++ {DNNL_ARG_WEIGHTS, b_m}, ++ {DNNL_ARG_DST, c_m}, 
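++                   // Scales are runtime arguments: set_scales_mask() above only
++                   // declares that scales will be supplied, and the actual values
++                   // are bound at execute() time via DNNL_ARG_ATTR_SCALES | DNNL_ARG_*.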
++ {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, ++ }); ++ } ++ } else { ++ if (bias) { ++ dnnl::memory::desc bias_md({N}, BiasType, {1}); ++ dnnl::memory bias_m(bias_md, engine, (void*)bias); ++ matmul.execute( ++ stream, { ++ {DNNL_ARG_SRC, a_m}, ++ {DNNL_ARG_WEIGHTS, b_m}, ++ {DNNL_ARG_BIAS, bias_m}, ++ {DNNL_ARG_DST, c_m}, ++ {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, ++ {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, ++ }); ++ } else { ++ matmul.execute( ++ stream, { ++ {DNNL_ARG_SRC, a_m}, ++ {DNNL_ARG_WEIGHTS, b_m}, ++ {DNNL_ARG_DST, c_m}, ++ {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, ++ {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, ++ }); ++ } ++ } ++ stream.wait(); ++ } ++ ++ private: ++ static dnnl::engine& default_engine() { ++ static dnnl::engine engine(dnnl::engine::kind::cpu, 0); ++ return engine; ++ } ++ ++ static dnnl::stream& default_stream() { ++ static dnnl::stream stream(default_engine()); ++ return stream; ++ } ++}; ++ ++#endif +diff --git a/csrc/cpu/layernorm.cpp b/csrc/cpu/layernorm.cpp +index 467f0dc..a76ad08 100644 +--- a/csrc/cpu/layernorm.cpp ++++ b/csrc/cpu/layernorm.cpp +@@ -2,10 +2,10 @@ + + namespace { + template +-void rms_norm_impl(scalar_t *__restrict__ out, +- const scalar_t *__restrict__ input, +- const scalar_t *__restrict__ weight, const float epsilon, +- const int num_tokens, const int hidden_size) { ++void rms_norm_impl(scalar_t* __restrict__ out, ++ const scalar_t* __restrict__ input, ++ const scalar_t* __restrict__ weight, const float epsilon, ++ const int num_tokens, const int hidden_size) { + using scalar_vec_t = vec_op::vec_t; + constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); + TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0); +@@ -41,11 +41,11 @@ void rms_norm_impl(scalar_t *__restrict__ out, + } + + template +-void fused_add_rms_norm_impl(scalar_t *__restrict__ input, +- scalar_t *__restrict__ residual, +- const scalar_t *__restrict__ weight, +- const float epsilon, const int num_tokens, +- const int hidden_size) { ++void fused_add_rms_norm_impl(scalar_t* __restrict__ input, ++ scalar_t* __restrict__ residual, ++ const scalar_t* __restrict__ weight, ++ const float epsilon, const int num_tokens, ++ const int hidden_size) { + using scalar_vec_t = vec_op::vec_t; + constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); + TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0); +@@ -85,24 +85,24 @@ void fused_add_rms_norm_impl(scalar_t *__restrict__ input, + } + } + } +-} // namespace ++} // namespace + +-void rms_norm(torch::Tensor &out, torch::Tensor &input, +- torch::Tensor &weight, float epsilon) { ++void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, ++ double epsilon) { + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_impl", [&] { + CPU_KERNEL_GUARD_IN(rms_norm_impl) + rms_norm_impl(out.data_ptr(), input.data_ptr(), +- weight.data_ptr(), epsilon, num_tokens, +- hidden_size); ++ weight.data_ptr(), epsilon, num_tokens, ++ hidden_size); + CPU_KERNEL_GUARD_OUT(rms_norm_impl) + }); + } + +-void fused_add_rms_norm(torch::Tensor &input, torch::Tensor &residual, +- torch::Tensor &weight, float epsilon) { ++void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, ++ torch::Tensor& weight, double epsilon) { + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + +diff --git a/csrc/cpu/pos_encoding.cpp b/csrc/cpu/pos_encoding.cpp +index 
e9b3992..96bce7d 100644 +--- a/csrc/cpu/pos_encoding.cpp ++++ b/csrc/cpu/pos_encoding.cpp +@@ -4,107 +4,107 @@ + namespace { + template + void rotary_embedding_impl( +- const int64_t +- *__restrict__ positions, // [batch_size, seq_len] or [num_tokens] +- scalar_t +- *__restrict__ query, /// [batch_size, seq_len, num_heads, head_size] or +- /// [num_tokens, num_heads, head_size] +- scalar_t +- *__restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or +- // [num_tokens, num_kv_heads, head_size] +- const scalar_t +- *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] ++ const int64_t* __restrict__ positions, // [batch_size, seq_len] or ++ // [num_tokens] ++ scalar_t* __restrict__ query, /// [batch_size, seq_len, num_heads, ++ /// head_size] or [num_tokens, num_heads, ++ /// head_size] ++ scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, ++ // head_size] or [num_tokens, num_kv_heads, ++ // head_size] ++ const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // ++ // 2] + const int rot_dim, const int64_t query_stride, const int64_t key_stride, + const int num_heads, const int num_kv_heads, const int head_size, + const int num_tokens) { + using scalar_vec_t = vec_op::vec_t; + constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); +- constexpr int ELEM_SIZE = sizeof(scalar_t); + + const int embed_dim = rot_dim / 2; +- TORCH_CHECK(embed_dim % VEC_ELEM_NUM == 0); ++ bool flag = (embed_dim % VEC_ELEM_NUM == 0); ++ const int loop_upper = flag ? embed_dim : embed_dim - VEC_ELEM_NUM; + +-#pragma omp parallel for +- for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { +- int64_t pos = positions[token_idx]; +- const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim; +- +- for (int i = 0; i < num_heads; ++i) { +- const int head_idx = i; +- const int64_t token_head = +- token_idx * query_stride + head_idx * head_size; +- for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) { +- const int rot_offset = j; +- const int x_index = rot_offset; +- const int y_index = embed_dim + rot_offset; ++ auto compute_loop = [&](const int64_t token_head, const scalar_t* cache_ptr, ++ scalar_t* qk) { ++ int j = 0; ++ for (; j < loop_upper; j += VEC_ELEM_NUM) { ++ const int rot_offset = j; ++ const int x_index = rot_offset; ++ const int y_index = embed_dim + rot_offset; + +- const int64_t out_x = token_head + x_index; +- const int64_t out_y = token_head + y_index; ++ const int64_t out_x = token_head + x_index; ++ const int64_t out_y = token_head + y_index; + +- const scalar_vec_t cos(cache_ptr + x_index); +- const scalar_vec_t sin(cache_ptr + y_index); ++ const scalar_vec_t cos(cache_ptr + x_index); ++ const scalar_vec_t sin(cache_ptr + y_index); + +- const scalar_vec_t q_x(query + out_x); +- const scalar_vec_t q_y(query + out_y); ++ const scalar_vec_t q_x(qk + out_x); ++ const scalar_vec_t q_y(qk + out_y); + +- vec_op::FP32Vec8 fp32_cos(cos); +- vec_op::FP32Vec8 fp32_sin(sin); ++ vec_op::FP32Vec8 fp32_cos(cos); ++ vec_op::FP32Vec8 fp32_sin(sin); + +- vec_op::FP32Vec8 fp32_q_x(q_x); +- vec_op::FP32Vec8 fp32_q_y(q_y); ++ vec_op::FP32Vec8 fp32_q_x(q_x); ++ vec_op::FP32Vec8 fp32_q_y(q_y); + +- auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin; +- scalar_vec_t(out1).save(query + out_x); ++ auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin; ++ scalar_vec_t(out1).save(qk + out_x); + +- auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin; +- scalar_vec_t(out2).save(query + out_y); +- } ++ auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin; ++ 
scalar_vec_t(out2).save(qk + out_y); + } +- +- for (int i = 0; i < num_kv_heads; ++i) { +- const int head_idx = i; +- const int64_t token_head = token_idx * key_stride + head_idx * head_size; +- for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) { +- const int rot_offset = j; +- const int x_index = rot_offset; +- const int y_index = embed_dim + rot_offset; ++ if (!flag) { ++ for (; j < embed_dim; ++j) { ++ const int x_index = j; ++ const int y_index = embed_dim + j; + + const int64_t out_x = token_head + x_index; + const int64_t out_y = token_head + y_index; + +- const scalar_vec_t cos(cache_ptr + x_index); +- const scalar_vec_t sin(cache_ptr + y_index); ++ const float fp32_cos = cache_ptr[x_index]; ++ const float fp32_sin = cache_ptr[y_index]; + +- const scalar_vec_t k_x(key + out_x); +- const scalar_vec_t k_y(key + out_y); ++ const float fp32_q_x = qk[out_x]; ++ const float fp32_q_y = qk[out_y]; + +- vec_op::FP32Vec8 fp32_cos(cos); +- vec_op::FP32Vec8 fp32_sin(sin); ++ qk[out_x] = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin; ++ qk[out_y] = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin; ++ } ++ } ++ }; + +- vec_op::FP32Vec8 fp32_k_x(k_x); +- vec_op::FP32Vec8 fp32_k_y(k_y); ++#pragma omp parallel for ++ for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { ++ int64_t pos = positions[token_idx]; ++ const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; + +- auto out1 = fp32_k_x * fp32_cos - fp32_k_y * fp32_sin; +- scalar_vec_t(out1).save(key + out_x); +- auto out2 = fp32_k_y * fp32_cos + fp32_k_x * fp32_sin; +- scalar_vec_t(out2).save(key + out_y); +- } ++ for (int i = 0; i < num_heads; ++i) { ++ const int head_idx = i; ++ const int64_t token_head = ++ token_idx * query_stride + head_idx * head_size; ++ compute_loop(token_head, cache_ptr, query); ++ } ++ ++ for (int i = 0; i < num_kv_heads; ++i) { ++ const int head_idx = i; ++ const int64_t token_head = token_idx * key_stride + head_idx * head_size; ++ compute_loop(token_head, cache_ptr, key); + } + } + } + + template + void rotary_embedding_gptj_impl( +- const int64_t +- *__restrict__ positions, // [batch_size, seq_len] or [num_tokens] +- scalar_t +- *__restrict__ query, /// [batch_size, seq_len, num_heads, head_size] or +- /// [num_tokens, num_heads, head_size] +- scalar_t +- *__restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or +- // [num_tokens, num_kv_heads, head_size] +- const scalar_t +- *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] ++ const int64_t* __restrict__ positions, // [batch_size, seq_len] or ++ // [num_tokens] ++ scalar_t* __restrict__ query, /// [batch_size, seq_len, num_heads, ++ /// head_size] or [num_tokens, num_heads, ++ /// head_size] ++ scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, ++ // head_size] or [num_tokens, num_kv_heads, ++ // head_size] ++ const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // ++ // 2] + const int rot_dim, const int64_t query_stride, const int64_t key_stride, + const int num_heads, const int num_kv_heads, const int head_size, + const int num_tokens) { +@@ -114,13 +114,13 @@ void rotary_embedding_gptj_impl( + for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { + for (int i = 0; i < num_heads; ++i) { + int64_t pos = positions[token_idx]; +- const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim; +- const scalar_t *cos_cache_ptr = cache_ptr; +- const scalar_t *sin_cache_ptr = cache_ptr + embed_dim; ++ const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; ++ const scalar_t* cos_cache_ptr = 
cache_ptr; ++ const scalar_t* sin_cache_ptr = cache_ptr + embed_dim; + const int head_idx = i; + const int64_t token_head = + token_idx * query_stride + head_idx * head_size; +- scalar_t *head_query = token_head + query; ++ scalar_t* head_query = token_head + query; + for (int j = 0; j < embed_dim; j += 1) { + const int rot_offset = j; + const int x_index = 2 * rot_offset; +@@ -142,12 +142,12 @@ void rotary_embedding_gptj_impl( + for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { + for (int i = 0; i < num_kv_heads; ++i) { + int64_t pos = positions[token_idx]; +- const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim; +- const scalar_t *cos_cache_ptr = cache_ptr; +- const scalar_t *sin_cache_ptr = cache_ptr + embed_dim; ++ const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; ++ const scalar_t* cos_cache_ptr = cache_ptr; ++ const scalar_t* sin_cache_ptr = cache_ptr + embed_dim; + const int head_idx = i; + const int64_t token_head = token_idx * key_stride + head_idx * head_size; +- scalar_t *head_key = key + token_head; ++ scalar_t* head_key = key + token_head; + for (int j = 0; j < embed_dim; j += 1) { + const int rot_offset = j; + const int x_index = 2 * rot_offset; +@@ -165,11 +165,11 @@ void rotary_embedding_gptj_impl( + } + } + } +-}; // namespace ++}; // namespace + +-void rotary_embedding(torch::Tensor &positions, torch::Tensor &query, +- torch::Tensor &key, int head_size, +- torch::Tensor &cos_sin_cache, bool is_neox) { ++void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, ++ torch::Tensor& key, int64_t head_size, ++ torch::Tensor& cos_sin_cache, bool is_neox) { + int num_tokens = query.numel() / query.size(-1); + int rot_dim = cos_sin_cache.size(1); + int num_heads = query.size(-1) / head_size; +diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp +new file mode 100644 +index 0000000..33b1637 +--- /dev/null ++++ b/csrc/cpu/quant.cpp +@@ -0,0 +1,613 @@ ++#include "cpu_types.hpp" ++#include "dnnl_helper.hpp" ++ ++namespace { ++template ++struct KernelVecType { ++ using load_vec_type = void; ++ using azp_adj_load_vec_type = void; ++ using cvt_vec_type = void; ++}; ++ ++template <> ++struct KernelVecType { ++ using load_vec_type = vec_op::FP32Vec16; ++ using azp_adj_load_vec_type = vec_op::INT32Vec16; ++ using cvt_vec_type = vec_op::FP32Vec16; ++}; ++ ++template <> ++struct KernelVecType { ++ using load_vec_type = vec_op::BF16Vec16; ++ using azp_adj_load_vec_type = vec_op::INT32Vec16; ++ using cvt_vec_type = vec_op::FP32Vec16; ++}; ++ ++template <> ++struct KernelVecType { ++#ifdef __powerpc64__ ++ // Power architecture-specific vector type ++ using load_vec_type = vec_op::FP32Vec16; ++#else ++ // Fallback for other architectures ++ using load_vec_type = vec_op::FP16Vec16; ++#endif ++ using azp_adj_load_vec_type = vec_op::INT32Vec16; ++ using cvt_vec_type = vec_op::FP32Vec16; ++}; ++ ++#ifdef __AVX512F__ ++template ++void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, ++ const float* scale, const int32_t* azp, ++ const int num_tokens, ++ const int hidden_size) { ++ using load_vec_t = typename KernelVecType::load_vec_type; ++ using cvt_vec_t = typename KernelVecType::cvt_vec_type; ++ constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; ++ ++ constexpr float i8_min = ++ static_cast(std::numeric_limits::min()); ++ constexpr float i8_max = ++ static_cast(std::numeric_limits::max()); ++ const cvt_vec_t inv_scale(1.0 / *scale); ++ const cvt_vec_t i8_min_vec(i8_min); ++ const cvt_vec_t i8_max_vec(i8_max); ++ ++ cvt_vec_t zp_vec; ++ 
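// Worked example of the static path: with *scale == 0.05f and *azp == 10,
++ // an input element of 1.0f becomes 1.0f * (1 / 0.05f) + 10 = 30.0f, which
++ // is clamped to [-128, 127] and then rounded to int8 by the INT8Vec16
++ // constructor's round-to-nearest conversion.
++ 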
if constexpr (AZP) { ++ zp_vec = cvt_vec_t(static_cast(*azp)); ++ } ++ ++ #pragma omp parallel for ++ for (int i = 0; i < num_tokens; ++i) { ++ int j = 0; ++ for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { ++ load_vec_t elems(input + i * hidden_size + j); ++ cvt_vec_t elems_fp32(elems); ++ elems_fp32 = elems_fp32 * inv_scale; ++ ++ if constexpr (AZP) { ++ elems_fp32 = elems_fp32 + zp_vec; ++ } ++ ++ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); ++ vec_op::INT8Vec16 elems_int8(elems_fp32); ++ elems_int8.save(output + i * hidden_size + j); ++ } ++ ++ load_vec_t elems(input + i * hidden_size + j); ++ cvt_vec_t elems_fp32(elems); ++ elems_fp32 = elems_fp32 * inv_scale; ++ ++ if constexpr (AZP) { ++ elems_fp32 = elems_fp32 + zp_vec; ++ } ++ ++ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); ++ vec_op::INT8Vec16 elems_int8(elems_fp32); ++ elems_int8.save(output + i * hidden_size + j, hidden_size - j); ++ } ++} ++ ++template ++void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, ++ float* scale, int32_t* azp, ++ const int num_tokens, ++ const int hidden_size) { ++ using load_vec_t = typename KernelVecType::load_vec_type; ++ using cvt_vec_t = typename KernelVecType::cvt_vec_type; ++ constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; ++ ++ constexpr float i8_min = ++ static_cast(std::numeric_limits::min()); ++ constexpr float i8_max = ++ static_cast(std::numeric_limits::max()); ++ const cvt_vec_t i8_min_vec(i8_min); ++ const cvt_vec_t i8_max_vec(i8_max); ++ ++ #pragma omp parallel for ++ for (int i = 0; i < num_tokens; ++i) { ++ cvt_vec_t max_value(std::numeric_limits::lowest()); ++ cvt_vec_t min_value(std::numeric_limits::max()); ++ { ++ int j = 0; ++ for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { ++ load_vec_t elems(input + i * hidden_size + j); ++ cvt_vec_t elems_fp32(elems); ++ if constexpr (AZP) { ++ max_value = max_value.max(elems_fp32); ++ min_value = min_value.min(elems_fp32); ++ } else { ++ max_value = max_value.max(elems_fp32.abs()); ++ } ++ } ++ ++ load_vec_t elems(input + i * hidden_size + j); ++ cvt_vec_t elems_fp32(elems); ++ ++ if (j + vec_elem_num == hidden_size) { ++ if constexpr (AZP) { ++ max_value = max_value.max(elems_fp32); ++ min_value = min_value.min(elems_fp32); ++ } else { ++ max_value = max_value.max(elems_fp32.abs()); ++ } ++ } else { ++ if constexpr (AZP) { ++ max_value = max_value.max(elems_fp32, hidden_size - j); ++ min_value = min_value.min(elems_fp32, hidden_size - j); ++ } else { ++ max_value = max_value.max(elems_fp32.abs(), hidden_size - j); ++ } ++ } ++ } ++ ++ float scale_val, azp_val; ++ if constexpr (AZP) { ++ float max_scalar = max_value.reduce_max(); ++ float min_scalar = min_value.reduce_min(); ++ scale_val = (max_scalar - min_scalar) / 255.0f; ++ azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); ++ azp[i] = static_cast(azp_val); ++ scale[i] = scale_val; ++ } else { ++ scale_val = max_value.reduce_max() / 127.0f; ++ scale[i] = scale_val; ++ } ++ ++ const cvt_vec_t inv_scale(1.0 / scale_val); ++ const cvt_vec_t azp_vec(azp_val); ++ ++ { ++ int j = 0; ++ for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { ++ load_vec_t elems(input + i * hidden_size + j); ++ cvt_vec_t elems_fp32(elems); ++ elems_fp32 = (elems_fp32 * inv_scale); ++ ++ if constexpr (AZP) { ++ elems_fp32 = elems_fp32 + azp_vec; ++ } ++ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); ++ vec_op::INT8Vec16 elems_int8(elems_fp32); ++ elems_int8.save(output + i * hidden_size + j); ++ } ++ ++ load_vec_t 
elems(input + i * hidden_size + j); ++ cvt_vec_t elems_fp32(elems); ++ elems_fp32 = (elems_fp32 * inv_scale); ++ ++ if constexpr (AZP) { ++ elems_fp32 = elems_fp32 + azp_vec; ++ } ++ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); ++ vec_op::INT8Vec16 elems_int8(elems_fp32); ++ elems_int8.save(output + i * hidden_size + j, hidden_size - j); ++ } ++ } ++} ++ ++template ++void static_quant_epilogue(const float* input, scalar_t* output, ++ const float a_scale, const float* b_scale, ++ const int32_t* azp_with_adj, const int num_tokens, ++ const int hidden_size) { ++ CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) ++ using load_vec_t = typename KernelVecType::load_vec_type; ++ using azp_adj_load_vec_t = ++ typename KernelVecType::azp_adj_load_vec_type; ++ using cvt_vec_t = typename KernelVecType::cvt_vec_type; ++ constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; ++ ++ #pragma omp parallel for ++ for (int i = 0; i < num_tokens; ++i) { ++ cvt_vec_t a_scale_vec(a_scale); ++ cvt_vec_t b_scale_vec(*b_scale); ++ cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; ++ ++ int j = 0; ++ for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { ++ cvt_vec_t elems_fp32(input + i * hidden_size + j); ++ azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); ++ cvt_vec_t azp_adj_fp32(azp_adj_vec); ++ ++ if constexpr (PerChannel) { ++ b_scale_vec = cvt_vec_t(b_scale + j); ++ scale_vec = b_scale_vec * a_scale_vec; ++ } ++ ++ elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; ++ ++ load_vec_t elems_out(elems_fp32); ++ elems_out.save(output + i * hidden_size + j); ++ } ++ ++ cvt_vec_t elems_fp32(input + i * hidden_size + j); ++ azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); ++ cvt_vec_t azp_adj_fp32(azp_adj_vec); ++ ++ if constexpr (PerChannel) { ++ b_scale_vec = cvt_vec_t(b_scale + j); ++ scale_vec = b_scale_vec * a_scale_vec; ++ } ++ ++ elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; ++ ++ load_vec_t elems_out(elems_fp32); ++ elems_out.save(output + i * hidden_size + j, hidden_size - j); ++ } ++} ++ ++template ++void dynamic_quant_epilogue(const float* input, scalar_t* output, ++ const float* a_scale, const float* b_scale, ++ const int32_t* azp, const int32_t* azp_adj, ++ const scalar_t* bias, const int num_tokens, ++ const int hidden_size) { ++ CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) ++ using load_vec_t = typename KernelVecType::load_vec_type; ++ using azp_adj_load_vec_t = ++ typename KernelVecType::azp_adj_load_vec_type; ++ using cvt_vec_t = typename KernelVecType::cvt_vec_type; ++ constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; ++ ++ #pragma omp parallel for ++ for (int i = 0; i < num_tokens; ++i) { ++ int j = 0; ++ cvt_vec_t token_scale_vec(a_scale[i]); ++ cvt_vec_t token_zp_scale_vec; ++ if constexpr (AZP) { ++ float zp_scale_val = a_scale[i] * static_cast(azp[i]); ++ if constexpr (!PerChannel) { ++ zp_scale_val *= *b_scale; ++ } ++ token_zp_scale_vec = cvt_vec_t(zp_scale_val); ++ } ++ ++ for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { ++ cvt_vec_t elems_fp32(input + i * hidden_size + j); ++ elems_fp32 = elems_fp32 * token_scale_vec; ++ ++ if constexpr (AZP) { ++ azp_adj_load_vec_t azp_adj_vec(azp_adj + j); ++ cvt_vec_t azp_adj_fp32(azp_adj_vec); ++ azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; ++ ++ if constexpr (PerChannel) { ++ cvt_vec_t b_scale_vec(b_scale + j); ++ azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; ++ } ++ ++ elems_fp32 = elems_fp32 - azp_adj_fp32; ++ } ++ ++ if constexpr (Bias) { ++ load_vec_t bias_vec(bias + j); ++ cvt_vec_t bias_vec_fp32(bias_vec); ++ 
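// Bias is stored in the output dtype (scalar_t), so it is widened to fp32
++ // here and added only after the per-token scale and any azp correction
++ // have been applied.
++ 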
elems_fp32 = elems_fp32 + bias_vec_fp32; ++ } ++ ++ load_vec_t elems_out(elems_fp32); ++ elems_out.save(output + i * hidden_size + j); ++ } ++ ++ cvt_vec_t elems_fp32(input + i * hidden_size + j); ++ elems_fp32 = elems_fp32 * token_scale_vec; ++ ++ if constexpr (AZP) { ++ azp_adj_load_vec_t azp_adj_vec(azp_adj + j); ++ cvt_vec_t azp_adj_fp32(azp_adj_vec); ++ azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; ++ ++ if constexpr (PerChannel) { ++ cvt_vec_t b_scale_vec(b_scale + j); ++ azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; ++ } ++ ++ elems_fp32 = elems_fp32 - azp_adj_fp32; ++ } ++ ++ if constexpr (Bias) { ++ load_vec_t bias_vec(bias + j); ++ cvt_vec_t bias_vec_fp32(bias_vec); ++ elems_fp32 = elems_fp32 + bias_vec_fp32; ++ } ++ ++ load_vec_t elems_out(elems_fp32); ++ elems_out.save(output + i * hidden_size + j, hidden_size - j); ++ } ++} ++#else ++template ++void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, ++ const float* scale, const int32_t* azp, ++ const int num_tokens, ++ const int hidden_size) { ++ TORCH_CHECK(false, "static_scaled_int8_quant_impl requires AVX512 support.") ++} ++ ++template ++void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, ++ float* scale, int32_t* azp, ++ const int num_tokens, ++ const int hidden_size) { ++ TORCH_CHECK(false, "dynamic_scaled_int8_quant_impl requires AVX512 support.") ++} ++ ++template ++void static_quant_epilogue(const float* input, scalar_t* output, ++ const float a_scale, const float* b_scale, ++ const int32_t* azp_with_adj, const int num_tokens, ++ const int hidden_size) { ++ TORCH_CHECK(false, "static_quant_epilogue requires AVX512 support.") ++} ++ ++template ++void dynamic_quant_epilogue(const float* input, scalar_t* output, ++ const float* a_scale, const float* b_scale, ++ const int32_t* azp, const int32_t* azp_with_adj, ++ const scalar_t* bias, const int num_tokens, ++ const int hidden_size) { ++ TORCH_CHECK(false, "dynamic_quant_epilogue requires AVX512 support.") ++} ++#endif ++} // namespace ++ ++void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major ++ const torch::Tensor& a, // [M, IC], row-major ++ const torch::Tensor& b, // [IC, OC], column-major ++ const torch::Tensor& a_scales, // [1] or [M] ++ const torch::Tensor& b_scales, // [1] or [OC] ++ const std::optional& bias // [OC] ++) { ++ CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) ++ // Checks for conformality ++ TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, ++ "int8_scaled_mm only supports INT8 inputs.") ++ TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); ++ TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && ++ b.size(1) == c.size(1)); ++ TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); ++ TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); ++ ++ // Check for strides and alignment ++ TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major ++ TORCH_CHECK(b.stride(0) == 1); // Column-major ++ TORCH_CHECK(c.stride(0) % 16 == 0 && ++ b.stride(1) % 16 == 0); // 16 Byte Alignment ++ TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); ++ ++ if (bias) { ++ TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() && ++ bias->dim() == 1); ++ } ++ ++ VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm", [&] { ++ if (a_scales.numel() != 1) { ++ // per-token ++ // Note: oneDNN doesn't support per-token activation quantization ++ // Ideally we want to fuse the GEMM and the scale procedure with oneDNN ++ // JIT, the intermediate 
data is cached in registers or L1. But for now ++ // the oneDNN GEMM code generation only supports two quantization ++ // patterns: per-tensor or per-output-channel of weight. ++ // So we have to apply the per-token scale with a 'epilogue'. In C=s_a * ++ // s_b * (A@B) + bias, the C_inter = s_b * (A@B) is computed by oneDNN ++ // GEMM, then the per-token scale (and bias) is applied with the epilogue ++ // C=s_a * C_inter + bias. ++ torch::Tensor tmp_fp32_out = ++ torch::empty_like(c, ::at::ScalarType::Float); ++ // Compute C_inter=s_b * (A@B) ++ DNNLPrimitiveHelper::gemm_s8s8_jit( ++ a.data_ptr(), b.data_ptr(), ++ tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), ++ a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); ++ if (bias.has_value()) { ++ // Compute C=s_a * C_inter + bias ++ dynamic_quant_epilogue( ++ tmp_fp32_out.data_ptr(), c.data_ptr(), ++ a_scales.data_ptr(), nullptr, nullptr, nullptr, ++ bias->data_ptr(), c.size(0), c.size(1)); ++ } else { ++ // Compute C=s_a * C_inter ++ dynamic_quant_epilogue( ++ tmp_fp32_out.data_ptr(), c.data_ptr(), ++ a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, ++ c.size(0), c.size(1)); ++ } ++ } else { ++ // per-tensor ++ if (bias.has_value()) { ++ // Compute C=s_a * s_b * (A@B) + bias ++ DNNLPrimitiveHelper::gemm_s8s8_jit( ++ a.data_ptr(), b.data_ptr(), c.data_ptr(), ++ bias->data_ptr(), a.size(0), b.size(1), a.size(1), ++ a_scales.data_ptr(), b_scales.data_ptr(), ++ a_scales.numel(), b_scales.numel()); ++ } else { ++ // Compute C=s_a * s_b * (A@B) ++ DNNLPrimitiveHelper::gemm_s8s8_jit( ++ a.data_ptr(), b.data_ptr(), c.data_ptr(), ++ nullptr, a.size(0), b.size(1), a.size(1), ++ a_scales.data_ptr(), b_scales.data_ptr(), ++ a_scales.numel(), b_scales.numel()); ++ } ++ } ++ }); ++} ++ ++void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major ++ const torch::Tensor& a, // [M, IC], row-major ++ const torch::Tensor& b, // [IC, OC], column-major ++ const torch::Tensor& a_scales, // [1] or [M] ++ const torch::Tensor& b_scales, // [1] or [OC] ++ const torch::Tensor& azp_adj, // [OC] ++ const std::optional& azp, // [1] or [M] ++ const std::optional& bias // [OC] ++) { ++ CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp) ++ // Checks for conformality ++ TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, ++ "int8_scaled_mm_azp only supports INT8 inputs.") ++ TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); ++ TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && ++ b.size(1) == c.size(1)); ++ TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); ++ TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); ++ ++ // Check for strides and alignment ++ TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major ++ TORCH_CHECK(b.stride(0) == 1); // Column-major ++ TORCH_CHECK(c.stride(0) % 16 == 0 && ++ b.stride(1) % 16 == 0); // 16 Byte Alignment ++ TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); ++ ++ if (bias) { ++ TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous()); ++ } ++ if (azp) { ++ TORCH_CHECK(azp->numel() == a.size(0) && azp->is_contiguous()); ++ } ++ TORCH_CHECK(azp_adj.numel() == b.size(1) && azp_adj.is_contiguous()); ++ ++ // azp & bias types ++ TORCH_CHECK(azp_adj.dtype() == torch::kInt32); ++ TORCH_CHECK(!azp || azp->dtype() == torch::kInt32); ++ TORCH_CHECK(!bias || bias->dtype() == c.dtype(), ++ "currently bias dtype must match output dtype ", c.dtype()); ++ ++ VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), 
"int8_scaled_mm_azp", [&] { ++ torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); ++ if (a_scales.numel() != 1) { ++ // per-token ++ // Note: oneDNN doesn't support per-token activation quantization ++ // Compute C_inter=s_b * (A@B) ++ DNNLPrimitiveHelper::gemm_s8s8_jit( ++ a.data_ptr(), b.data_ptr(), ++ tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), ++ a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); ++ if (bias.has_value()) { ++ // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + bias ++ if (b_scales.numel() != 1) { ++ // Per-Channel ++ dynamic_quant_epilogue( ++ tmp_fp32_out.data_ptr(), c.data_ptr(), ++ a_scales.data_ptr(), b_scales.data_ptr(), ++ azp->data_ptr(), azp_adj.data_ptr(), ++ bias->data_ptr(), c.size(0), c.size(1)); ++ } else { ++ // Per-Tensor ++ dynamic_quant_epilogue( ++ tmp_fp32_out.data_ptr(), c.data_ptr(), ++ a_scales.data_ptr(), b_scales.data_ptr(), ++ azp->data_ptr(), azp_adj.data_ptr(), ++ bias->data_ptr(), c.size(0), c.size(1)); ++ } ++ } else { ++ // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj ++ if (b_scales.numel() != 1) { ++ // Per-Channel ++ dynamic_quant_epilogue( ++ tmp_fp32_out.data_ptr(), c.data_ptr(), ++ a_scales.data_ptr(), b_scales.data_ptr(), ++ azp->data_ptr(), azp_adj.data_ptr(), nullptr, ++ c.size(0), c.size(1)); ++ } else { ++ // Per-Tensor ++ dynamic_quant_epilogue( ++ tmp_fp32_out.data_ptr(), c.data_ptr(), ++ a_scales.data_ptr(), b_scales.data_ptr(), ++ azp->data_ptr(), azp_adj.data_ptr(), nullptr, ++ c.size(0), c.size(1)); ++ } ++ } ++ } else { ++ // per-tensor ++ if (bias.has_value()) { ++ // Compute C_inter=s_a * s_b * (A@B) + bias ++ DNNLPrimitiveHelper::gemm_s8s8_jit( ++ a.data_ptr(), b.data_ptr(), ++ tmp_fp32_out.data_ptr(), bias->data_ptr(), ++ a.size(0), b.size(1), a.size(1), a_scales.data_ptr(), ++ b_scales.data_ptr(), a_scales.numel(), b_scales.numel()); ++ } else { ++ // Compute C_inter=s_a * s_b * (A@B) ++ DNNLPrimitiveHelper::gemm_s8s8_jit( ++ a.data_ptr(), b.data_ptr(), ++ tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), ++ a.size(1), a_scales.data_ptr(), b_scales.data_ptr(), ++ a_scales.numel(), b_scales.numel()); ++ } ++ ++ // Compute C=C_inter - s_a * s_b * azp_adj ++ if (b_scales.numel() != 1) { ++ // Per-Channel ++ static_quant_epilogue( ++ tmp_fp32_out.data_ptr(), c.data_ptr(), ++ *a_scales.data_ptr(), b_scales.data_ptr(), ++ azp_adj.data_ptr(), a.size(0), b.size(1)); ++ } else { ++ // Per-Tensor ++ static_quant_epilogue( ++ tmp_fp32_out.data_ptr(), c.data_ptr(), ++ *a_scales.data_ptr(), b_scales.data_ptr(), ++ azp_adj.data_ptr(), a.size(0), b.size(1)); ++ } ++ } ++ }); ++} ++ ++// static-per-tensor quantization. 
++void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] ++ const torch::Tensor& input, // [..., hidden_size] ++ const torch::Tensor& scale, ++ std::optional const& azp) { ++ CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) ++ TORCH_CHECK(input.is_contiguous()); ++ TORCH_CHECK(out.is_contiguous()); ++ TORCH_CHECK(scale.numel() == 1); ++ TORCH_CHECK(!azp.has_value() || azp->numel() == 1); ++ ++ const int hidden_size = input.size(-1); ++ const int num_tokens = input.numel() / hidden_size; ++ VLLM_DISPATCH_FLOATING_TYPES( ++ input.scalar_type(), "static_scaled_int8_quant_impl", [&] { ++ if (azp.has_value()) { ++ static_scaled_int8_quant_impl( ++ input.data_ptr(), out.data_ptr(), ++ scale.data_ptr(), azp->data_ptr(), num_tokens, ++ hidden_size); ++ } else { ++ static_scaled_int8_quant_impl( ++ input.data_ptr(), out.data_ptr(), ++ scale.data_ptr(), nullptr, num_tokens, hidden_size); ++ } ++ }); ++} ++ ++// dynamic-per-token quantization. ++void dynamic_scaled_int8_quant( ++ torch::Tensor& out, // [..., hidden_size] ++ const torch::Tensor& input, // [..., hidden_size] ++ torch::Tensor& scale, // [..., 1] ++ std::optional const& azp) { ++ CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) ++ TORCH_CHECK(input.is_contiguous()); ++ TORCH_CHECK(out.is_contiguous()); ++ ++ int const hidden_size = input.size(-1); ++ int const num_tokens = input.numel() / hidden_size; ++ VLLM_DISPATCH_FLOATING_TYPES( ++ input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] { ++ if (azp.has_value()) { ++ dynamic_scaled_int8_quant_impl( ++ input.data_ptr(), out.data_ptr(), ++ scale.data_ptr(), azp->data_ptr(), num_tokens, ++ hidden_size); ++ } else { ++ dynamic_scaled_int8_quant_impl( ++ input.data_ptr(), out.data_ptr(), ++ scale.data_ptr(), nullptr, num_tokens, hidden_size); ++ } ++ }); ++} +diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp +new file mode 100644 +index 0000000..74e4d81 +--- /dev/null ++++ b/csrc/cpu/torch_bindings.cpp +@@ -0,0 +1,160 @@ ++#include "cache.h" ++#include "ops.h" ++#include "core/registration.h" ++ ++#include ++ ++std::string init_cpu_threads_env(const std::string& cpu_ids); ++ ++void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, ++ const torch::Tensor& b, const torch::Tensor& a_scales, ++ const torch::Tensor& b_scales, ++ const std::optional& bias); ++ ++void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, ++ const torch::Tensor& b, const torch::Tensor& a_scales, ++ const torch::Tensor& b_scales, ++ const torch::Tensor& azp_adj, ++ const std::optional& azp, ++ const std::optional& bias); ++ ++TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ++ // vLLM custom ops ++ ++ // Attention ops ++ // Compute the attention between an input query and the cached keys/values ++ // using PagedAttention. ++ ops.def( ++ "paged_attention_v1(" ++ " Tensor! out, Tensor query, Tensor key_cache," ++ " Tensor value_cache, int num_kv_heads, float scale," ++ " Tensor block_tables, Tensor seq_lens, int block_size," ++ " int max_seq_len, Tensor? alibi_slopes," ++ " str kv_cache_dtype, float k_scale, float v_scale," ++ " int tp_rank, int blocksparse_local_blocks," ++ " int blocksparse_vert_stride, int blocksparse_block_size," ++ " int blocksparse_head_sliding_step) -> ()"); ++ ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1); ++ ++ // PagedAttention V2. ++ ops.def( ++ "paged_attention_v2(" ++ " Tensor! out, Tensor! exp_sums, Tensor! max_logits," ++ " Tensor! 
tmp_out, Tensor query, Tensor key_cache," ++ " Tensor value_cache, int num_kv_heads, float scale," ++ " Tensor block_tables, Tensor seq_lens, int block_size," ++ " int max_seq_len, Tensor? alibi_slopes," ++ " str kv_cache_dtype, float k_scale, float v_scale," ++ " int tp_rank, int blocksparse_local_blocks," ++ " int blocksparse_vert_stride, int blocksparse_block_size," ++ " int blocksparse_head_sliding_step) -> ()"); ++ ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2); ++ ++ // Activation ops ++ ++ // Activation function used in SwiGLU. ++ ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()"); ++ ops.impl("silu_and_mul", torch::kCPU, &silu_and_mul); ++ ++ // Activation function used in GeGLU with `none` approximation. ++ ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()"); ++ ops.impl("gelu_and_mul", torch::kCPU, &gelu_and_mul); ++ ++ // Activation function used in GeGLU with `tanh` approximation. ++ ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()"); ++ ops.impl("gelu_tanh_and_mul", torch::kCPU, &gelu_tanh_and_mul); ++ ++ // GELU implementation used in GPT-2. ++ ops.def("gelu_new(Tensor! out, Tensor input) -> ()"); ++ ops.impl("gelu_new", torch::kCPU, &gelu_new); ++ ++ // Approximate GELU implementation. ++ ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); ++ ops.impl("gelu_fast", torch::kCPU, &gelu_fast); ++ ++ // Quick GELU implementation. ++ ops.def("gelu_quick(Tensor! out, Tensor input) -> ()"); ++ ops.impl("gelu_quick", torch::kCPU, &gelu_quick); ++ ++ // Layernorm ++ // Apply Root Mean Square (RMS) Normalization to the input tensor. ++ ops.def( ++ "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> " ++ "()"); ++ ops.impl("rms_norm", torch::kCPU, &rms_norm); ++ ++ // In-place fused Add and RMS Normalization. ++ ops.def( ++ "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, " ++ "float epsilon) -> ()"); ++ ops.impl("fused_add_rms_norm", torch::kCPU, &fused_add_rms_norm); ++ ++ // Rotary embedding ++ // Apply GPT-NeoX or GPT-J style rotary embedding to query and key. ++ ops.def( ++ "rotary_embedding(Tensor positions, Tensor! query," ++ " Tensor! key, int head_size," ++ " Tensor cos_sin_cache, bool is_neox) -> ()"); ++ ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); ++ ++ // Quantization ++#ifdef __AVX512F__ ++ // Compute int8 quantized tensor for given scaling factor. ++ ops.def( ++ "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," ++ "Tensor? azp) -> ()"); ++ ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); ++ ++ // Compute int8 quantized tensor and scaling factor ++ ops.def( ++ "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " ++ "Tensor!? azp) -> ()"); ++ ops.impl("dynamic_scaled_int8_quant", torch::kCPU, ++ &dynamic_scaled_int8_quant); ++ // W8A8 GEMM, supporting symmetric per-tensor or per-row/column ++ // quantization. ++ ops.def( ++ "cutlass_scaled_mm(Tensor! out, Tensor a," ++ " Tensor b, Tensor a_scales," ++ " Tensor b_scales, Tensor? bias) -> ()"); ++ ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm); ++ // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column ++ // quantization. ++ ops.def( ++ "cutlass_scaled_mm_azp(Tensor! out, Tensor a," ++ " Tensor b, Tensor a_scales," ++ " Tensor b_scales, Tensor azp_adj," ++ " Tensor? azp, Tensor? 
bias) -> ()"); ++ ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); ++#endif ++} ++ ++TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { ++ // Cache ops ++ // Swap in (out) the cache blocks from src to dst. ++ cache_ops.def( ++ "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()"); ++ cache_ops.impl("swap_blocks", torch::kCPU, &swap_blocks); ++ ++ // Copy the cache blocks from src to dst. ++ cache_ops.def( ++ "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, " ++ "Tensor block_mapping) -> ()"); ++ cache_ops.impl("copy_blocks", torch::kCPU, ©_blocks); ++ ++ // Reshape the key and value tensors and cache them. ++ cache_ops.def( ++ "reshape_and_cache(Tensor key, Tensor value," ++ " Tensor! key_cache, Tensor! value_cache," ++ " Tensor slot_mapping," ++ " str kv_cache_dtype," ++ " float k_scale, float v_scale) -> ()"); ++ cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); ++} ++ ++TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { ++ // CPU utils ++ utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env); ++} ++ ++REGISTER_EXTENSION(TORCH_EXTENSION_NAME) +diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp +new file mode 100644 +index 0000000..42a1c1d +--- /dev/null ++++ b/csrc/cpu/utils.cpp +@@ -0,0 +1,103 @@ ++#ifndef VLLM_NUMA_DISABLED ++ #include ++ #include ++ #include ++ #include ++#endif ++ ++#include "cpu_types.hpp" ++ ++#ifdef VLLM_NUMA_DISABLED ++std::string init_cpu_threads_env(const std::string& cpu_ids) { ++ return std::string( ++ "Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has " ++ "no effect to setup thread affinity."); ++} ++ ++#endif ++ ++#ifndef VLLM_NUMA_DISABLED ++std::string init_cpu_threads_env(const std::string& cpu_ids) { ++ bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str()); ++ TORCH_CHECK(omp_cpu_mask->size > 0); ++ std::vector omp_cpu_ids; ++ omp_cpu_ids.reserve(omp_cpu_mask->size); ++ ++ constexpr int group_size = 8 * sizeof(*omp_cpu_mask->maskp); ++ ++ for (int offset = 0; offset < omp_cpu_mask->size; offset += group_size) { ++ unsigned long group_mask = omp_cpu_mask->maskp[offset / group_size]; ++ int i = 0; ++ while (group_mask) { ++ if (group_mask & 1) { ++ omp_cpu_ids.emplace_back(offset + i); ++ } ++ ++i; ++ group_mask >>= 1; ++ } ++ } ++ ++ // Memory node binding ++ if (numa_available() != -1) { ++ int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front()); ++ bitmask* mask = numa_parse_nodestring(std::to_string(mem_node_id).c_str()); ++ bitmask* src_mask = numa_get_membind(); ++ ++ int pid = getpid(); ++ ++ // move all existing pages to the specified numa node. ++ *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp); ++ int page_num = numa_migrate_pages(pid, src_mask, mask); ++ if (page_num == -1) { ++ TORCH_CHECK(false, ++ "numa_migrate_pages failed. errno: " + std::to_string(errno)); ++ } ++ ++ // restrict memory allocation node. 
++ numa_set_membind(mask); ++ numa_set_strict(1); ++ } ++ ++ // OMP threads binding ++ omp_set_num_threads((int)omp_cpu_ids.size()); ++ torch::set_num_threads((int)omp_cpu_ids.size()); ++ TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads()); ++ TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads()); ++ ++ std::vector> thread_core_mapping; ++ thread_core_mapping.reserve(omp_cpu_ids.size()); ++ omp_lock_t writelock; ++ omp_init_lock(&writelock); ++ ++ #pragma omp parallel for schedule(static, 1) ++ for (size_t i = 0; i < omp_cpu_ids.size(); ++i) { ++ cpu_set_t mask; ++ CPU_ZERO(&mask); ++ CPU_SET(omp_cpu_ids[i], &mask); ++ int ret = sched_setaffinity(0, sizeof(cpu_set_t), &mask); ++ if (ret == -1) { ++ TORCH_CHECK(false, ++ "sched_setaffinity failed. errno: " + std::to_string(errno)); ++ } ++ ++ omp_set_lock(&writelock); ++ thread_core_mapping.emplace_back(gettid(), omp_cpu_ids[i]); ++ omp_unset_lock(&writelock); ++ } ++ ++ omp_destroy_lock(&writelock); ++ ++ numa_free_nodemask(omp_cpu_mask); ++ ++ std::stringstream ss; ++ ss << "OMP threads binding of Process " << getpid() << ":\n"; ++ std::sort(thread_core_mapping.begin(), thread_core_mapping.end(), ++ [](auto&& a, auto&& b) { return a.second < b.second; }); ++ for (auto&& item : thread_core_mapping) { ++ ss << "\t" ++ << "OMP tid: " << item.first << ", core " << item.second << "\n"; ++ } ++ ++ return ss.str(); ++} ++#endif +\ No newline at end of file +diff --git a/csrc/cuda_compat.h b/csrc/cuda_compat.h +index c711d8d..82e5561 100644 +--- a/csrc/cuda_compat.h ++++ b/csrc/cuda_compat.h +@@ -1,7 +1,7 @@ + #pragma once + + #ifdef USE_ROCM +-#include ++ #include + #endif + + #ifndef USE_ROCM +@@ -17,9 +17,14 @@ + #endif + + #ifndef USE_ROCM +- #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask) ++ #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \ ++ __shfl_xor_sync(uint32_t(-1), var, lane_mask) ++ #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ ++ __shfl_xor_sync(uint32_t(-1), var, lane_mask, width) + #else + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) ++ #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ ++ __shfl_xor(var, lane_mask, width) + #endif + + #ifndef USE_ROCM +@@ -28,6 +33,13 @@ + #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) + #endif + ++#ifndef USE_ROCM ++ #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \ ++ __shfl_down_sync(uint32_t(-1), var, lane_delta) ++#else ++ #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta) ++#endif ++ + #ifndef USE_ROCM + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL) +@@ -35,4 +47,3 @@ + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) + #endif +- +diff --git a/csrc/cuda_utils.h b/csrc/cuda_utils.h +index 1483484..c352242 100644 +--- a/csrc/cuda_utils.h ++++ b/csrc/cuda_utils.h +@@ -1,10 +1,15 @@ + #pragma once + +-#include ++#if defined(__CUDACC__) || defined(_NVHPC_CUDA) ++ #define HOST_DEVICE_INLINE __forceinline__ __host__ __device__ ++ #define DEVICE_INLINE __forceinline__ __device__ ++ #define HOST_INLINE __forceinline__ __host__ ++#else ++ #define HOST_DEVICE_INLINE inline ++ #define DEVICE_INLINE inline ++ #define HOST_INLINE inline ++#endif + +-int get_device_attribute( +- int attribute, +- int device_id); ++int64_t get_device_attribute(int64_t attribute, int64_t 
device_id); + +-int get_max_shared_memory_per_block_device_attribute( +- int device_id); ++int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id); +diff --git a/csrc/cuda_utils_kernels.cu b/csrc/cuda_utils_kernels.cu +index 1a443ef..d6f9eb6 100644 +--- a/csrc/cuda_utils_kernels.cu ++++ b/csrc/cuda_utils_kernels.cu +@@ -2,34 +2,28 @@ + #include + #include + #endif +-int get_device_attribute( +- int attribute, +- int device_id) +-{ +- int device, value; +- if (device_id < 0) { +- cudaGetDevice(&device); +- } +- else { +- device = device_id; +- } +- cudaDeviceGetAttribute(&value, static_cast(attribute), device); +- return value; ++int64_t get_device_attribute(int64_t attribute, int64_t device_id) { ++ int device, value; ++ if (device_id < 0) { ++ cudaGetDevice(&device); ++ } else { ++ device = device_id; ++ } ++ cudaDeviceGetAttribute(&value, static_cast(attribute), ++ device); ++ return value; + } + +- +-int get_max_shared_memory_per_block_device_attribute( +- int device_id) +-{ +-int attribute; +-// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html +-// cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 ++int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) { ++ int64_t attribute; ++ // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html ++ // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 + + #ifdef USE_ROCM +- attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; ++ attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; + #else +- attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin; ++ attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin; + #endif + +- return get_device_attribute(attribute, device_id); ++ return get_device_attribute(attribute, device_id); + } +diff --git a/csrc/custom_all_reduce.cu b/csrc/custom_all_reduce.cu +index 3906dcf..123278b 100644 +--- a/csrc/custom_all_reduce.cu ++++ b/csrc/custom_all_reduce.cu +@@ -1,36 +1,33 @@ + #include + #include + #include +-#include ++#include + + #include "custom_all_reduce.cuh" + +-// fake pointer type +-using fptr_t = uint64_t; +-static_assert(sizeof(void *) == sizeof(fptr_t)); ++// Fake pointer type, must match fptr_t type in ops.h. ++// We use this type alias to indicate when pointers are passed in as int64_t. 
++using fptr_t = int64_t; ++static_assert(sizeof(void*) == sizeof(fptr_t)); + +-fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, +- const std::vector &handles, +- const std::vector &offsets, int rank, ++fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, ++ torch::Tensor& rank_data, int64_t rank, + bool full_nvlink) { +- int world_size = offsets.size(); ++ int world_size = fake_ipc_ptrs.size(); + if (world_size > 8) + throw std::invalid_argument("world size > 8 is not supported"); + if (world_size % 2 != 0) + throw std::invalid_argument("Odd num gpus is not supported for now"); +- if (world_size != handles.size()) +- throw std::invalid_argument( +- "handles length should equal to offsets length"); + if (rank < 0 || rank >= world_size) + throw std::invalid_argument("invalid rank passed in"); + +- cudaIpcMemHandle_t ipc_handles[8]; ++ vllm::Signal* ipc_ptrs[8]; + for (int i = 0; i < world_size; i++) { +- std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t)); ++ ipc_ptrs[i] = reinterpret_cast(fake_ipc_ptrs[i]); + } +- return (fptr_t) new vllm::CustomAllreduce( +- reinterpret_cast(meta.data_ptr()), rank_data.data_ptr(), +- rank_data.numel(), ipc_handles, offsets, rank, full_nvlink); ++ return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(), ++ rank_data.numel(), rank, world_size, ++ full_nvlink); + } + + /** +@@ -49,46 +46,55 @@ fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, + * 5. A[None].expand(2, -1, -1, -1): Not OK + * 6. A[:, 1:, 1:]: Not OK + */ +-bool _is_weak_contiguous(torch::Tensor &t) { ++bool _is_weak_contiguous(torch::Tensor& t) { + return t.is_contiguous() || + (t.storage().nbytes() - t.storage_offset() * t.element_size() == + t.numel() * t.element_size()); + } + +-bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size, +- bool full_nvlink) { +- auto inp_size = inp.numel() * inp.element_size(); +- // custom allreduce requires input byte size to be multiples of 16 +- if (inp_size % 16 != 0) return false; +- if (!_is_weak_contiguous(inp)) return false; +- if (world_size == 2 || full_nvlink) return inp_size <= max_size; +- // for 4 or more non NVLink-capable GPUs, custom allreduce provides little +- // performance improvement over NCCL. +- return false; +-} ++/** ++ * Performs an out-of-place allreduce and stores result in out. ++ * ++ * If _reg_buffer is null, assumes inp.data_ptr() is already IPC-registered. ++ * Otherwise, _reg_buffer is assumed to be IPC-registered and inp is first ++ * copied into _reg_buffer. 
++ */ ++void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, ++ fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) { ++ auto fa = reinterpret_cast(_fa); ++ const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); ++ auto stream = c10::cuda::getCurrentCUDAStream().stream(); + +-void _all_reduce(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out, +- cudaStream_t stream) { +- auto fa = reinterpret_cast(_fa); ++ TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); ++ TORCH_CHECK_EQ(inp.numel(), out.numel()); + TORCH_CHECK(_is_weak_contiguous(out)); ++ TORCH_CHECK(_is_weak_contiguous(inp)); ++ auto input_size = inp.numel() * inp.element_size(); ++ auto reg_buffer = reinterpret_cast(_reg_buffer); ++ if (reg_buffer) { ++ TORCH_CHECK_LE(input_size, reg_buffer_sz_bytes); ++ AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer, inp.data_ptr(), input_size, ++ cudaMemcpyDeviceToDevice, stream)); ++ } else { ++ reg_buffer = inp.data_ptr(); ++ } + switch (out.scalar_type()) { + case at::ScalarType::Float: { +- fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), +- reinterpret_cast(out.data_ptr()), ++ fa->allreduce(stream, reinterpret_cast(reg_buffer), ++ reinterpret_cast(out.data_ptr()), + out.numel()); + break; + } + case at::ScalarType::Half: { +- fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), +- reinterpret_cast(out.data_ptr()), +- out.numel()); ++ fa->allreduce(stream, reinterpret_cast(reg_buffer), ++ reinterpret_cast(out.data_ptr()), out.numel()); + break; + } + #if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) + case at::ScalarType::BFloat16: { + fa->allreduce( +- stream, reinterpret_cast(inp.data_ptr()), +- reinterpret_cast(out.data_ptr()), out.numel()); ++ stream, reinterpret_cast(reg_buffer), ++ reinterpret_cast(out.data_ptr()), out.numel()); + break; + } + #endif +@@ -98,51 +104,41 @@ void _all_reduce(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out, + } + } + +-void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out) { +- const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); +- auto stream = c10::cuda::getCurrentCUDAStream().stream(); +- TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); +- TORCH_CHECK_EQ(inp.numel(), out.numel()); +- _all_reduce(_fa, inp, out, stream); +-} +- +-void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor ®_buffer, +- torch::Tensor &out) { +- const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); +- auto stream = c10::cuda::getCurrentCUDAStream().stream(); +- +- auto input_size = inp.numel() * inp.element_size(); +- TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); +- TORCH_CHECK_EQ(inp.numel(), out.numel()); +- TORCH_CHECK(input_size <= reg_buffer.numel() * reg_buffer.element_size(), +- "registered buffer is too small to contain the input"); +- AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer.data_ptr(), inp.data_ptr(), +- input_size, cudaMemcpyDeviceToDevice, stream)); +- _all_reduce(_fa, reg_buffer, out, stream); +-} +- + void dispose(fptr_t _fa) { +- auto fa = reinterpret_cast(_fa); +- delete fa; ++ delete reinterpret_cast(_fa); + } + +-int meta_size() { return sizeof(vllm::Signal); } ++int64_t meta_size() { return sizeof(vllm::Signal); } + +-void register_buffer(fptr_t _fa, torch::Tensor &t, +- const std::vector &handles, +- const std::vector &offsets) { +- auto fa = reinterpret_cast(_fa); +- fa->register_buffer(handles, offsets, t.data_ptr()); ++void register_buffer(fptr_t _fa, const std::vector& fake_ipc_ptrs) { ++ auto fa = reinterpret_cast(_fa); ++ TORCH_CHECK(fake_ipc_ptrs.size() == 
fa->world_size_); ++ void* ipc_ptrs[8]; ++ for (int i = 0; i < fake_ipc_ptrs.size(); i++) { ++ ipc_ptrs[i] = reinterpret_cast(fake_ipc_ptrs[i]); ++ } ++ fa->register_buffer(ipc_ptrs); + } + +-std::pair, std::vector> get_graph_buffer_ipc_meta( +- fptr_t _fa) { +- auto fa = reinterpret_cast(_fa); +- return fa->get_graph_buffer_ipc_meta(); ++// Use vector to represent byte data for python binding compatibility. ++std::tuple, std::vector> ++get_graph_buffer_ipc_meta(fptr_t _fa) { ++ auto fa = reinterpret_cast(_fa); ++ auto [handle, offsets] = fa->get_graph_buffer_ipc_meta(); ++ std::vector bytes(handle.begin(), handle.end()); ++ return std::make_tuple(bytes, offsets); + } + +-void register_graph_buffers(fptr_t _fa, const std::vector &handles, +- const std::vector> &offsets) { +- auto fa = reinterpret_cast(_fa); +- fa->register_graph_buffers(handles, offsets); ++// Use vector to represent byte data for python binding compatibility. ++void register_graph_buffers(fptr_t _fa, ++ const std::vector>& handles, ++ const std::vector>& offsets) { ++ auto fa = reinterpret_cast(_fa); ++ std::vector bytes; ++ bytes.reserve(handles.size()); ++ for (int i = 0; i < handles.size(); i++) { ++ bytes.emplace_back(handles[i].begin(), handles[i].end()); ++ } ++ bytes.reserve(handles.size()); ++ fa->register_graph_buffers(bytes, offsets); + } +diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh +index 750e68d..6be4d4f 100644 +--- a/csrc/custom_all_reduce.cuh ++++ b/csrc/custom_all_reduce.cuh +@@ -6,6 +6,7 @@ + #include + + #include ++#include + #include + #include + #include +@@ -23,17 +24,23 @@ + + namespace vllm { + +-constexpr int kMaxBlocks = 64; +-// note: we don't want to use atomics for signals because peer atomics are no +-// supported on PCIe links ++constexpr int kMaxBlocks = 36; ++// Counter may overflow, but it's fine since unsigned int overflow is ++// well-defined behavior. ++using FlagType = uint32_t; + struct Signal { +- alignas(128) uint32_t start[kMaxBlocks][8]; +- alignas(128) uint32_t end[kMaxBlocks][8]; ++ alignas(128) FlagType self_counter[kMaxBlocks][8]; ++ // Two sets of peer counters are needed for two syncs. The reason is that ++ // it's possible for a peer GPU block to arrive at the second sync point while ++ // the current GPU block hasn't passed the first sync point. Thus, the peer GPU ++ // may write counter+1 while the current GPU is busy waiting for counter. We use ++ // an alternating counter array to avoid this possibility.
++ alignas(128) FlagType peer_counter[2][kMaxBlocks][8]; + }; + +-struct __align__(16) RankData { const void *__restrict__ ptrs[8]; }; ++struct __align__(16) RankData { const void* __restrict__ ptrs[8]; }; + +-struct __align__(16) RankSignals { volatile Signal *signals[8]; }; ++struct __align__(16) RankSignals { Signal* signals[8]; }; + + // like std::array, but aligned + template +@@ -68,11 +75,11 @@ DINLINE half downcast_s(float val) { + // scalar add functions + // for some reason when compiling with Pytorch, the + operator for half and + // bfloat is disabled so we call the intrinsics directly +-DINLINE half &assign_add(half &a, half b) { ++DINLINE half& assign_add(half& a, half b) { + a = __hadd(a, b); + return a; + } +-DINLINE float &assign_add(float &a, float b) { return a += b; } ++DINLINE float& assign_add(float& a, float b) { return a += b; } + + #if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) + DINLINE float upcast_s(nv_bfloat16 val) { return __bfloat162float(val); } +@@ -80,14 +87,14 @@ template <> + DINLINE nv_bfloat16 downcast_s(float val) { + return __float2bfloat16(val); + } +-DINLINE nv_bfloat16 &assign_add(nv_bfloat16 &a, nv_bfloat16 b) { ++DINLINE nv_bfloat16& assign_add(nv_bfloat16& a, nv_bfloat16 b) { + a = __hadd(a, b); + return a; + } + #endif + + template +-DINLINE array_t &packed_assign_add(array_t &a, array_t b) { ++DINLINE array_t& packed_assign_add(array_t& a, array_t b) { + #pragma unroll + for (int i = 0; i < N; i++) { + assign_add(a.data[i], b.data[i]); +@@ -123,53 +130,75 @@ DINLINE O downcast(array_t val) { + } + } + +-// This function is meant to be used as the first synchronization in the all +-// reduce kernel. Thus, it doesn't need to make any visibility guarantees for +-// prior memory accesses. Note: volatile writes will not be reordered against +-// other volatile writes. +-template +-DINLINE void start_sync(const RankSignals &sg, volatile Signal *self_sg, +- int rank) { +- if (threadIdx.x < ngpus) { +- // reset flag for next time +- self_sg->end[blockIdx.x][threadIdx.x] = 0; +- // simultaneously write to the corresponding flag of all ranks. +- // Latency = 1 p2p write +- sg.signals[threadIdx.x]->start[blockIdx.x][rank] = 1; +- // wait until we got true from all ranks +- while (!self_sg->start[blockIdx.x][threadIdx.x]) +- ; +- } +- __syncthreads(); ++static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) { ++#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 ++ asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag), ++ "l"(flag_addr)); ++#else ++ asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag), ++ "l"(flag_addr)); ++#endif ++} ++ ++static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) { ++ FlagType flag; ++#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 ++ asm volatile("ld.acquire.sys.global.u32 %0, [%1];" ++ : "=r"(flag) ++ : "l"(flag_addr)); ++#else ++ asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;" ++ : "=r"(flag) ++ : "l"(flag_addr)); ++#endif ++ return flag; ++} ++ ++static DINLINE void st_flag_volatile(FlagType* flag_addr, FlagType flag) { ++ asm volatile("st.volatile.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr)); ++} ++ ++static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) { ++ FlagType flag; ++ asm volatile("ld.volatile.global.u32 %0, [%1];" ++ : "=r"(flag) ++ : "l"(flag_addr)); ++ return flag; + } + +-// This function is meant to be used as the second or the final synchronization +-// barrier in the all reduce kernel. 
If it's the final synchronization barrier, +-// we don't need to make any visibility guarantees for prior memory accesses. +-template +-DINLINE void end_sync(const RankSignals &sg, volatile Signal *self_sg, +- int rank) { +- __syncthreads(); +- // eliminate the case that prior writes are not visible after signals become +- // visible. Note that I did not managed to make this happen through a lot of +- // testing. Might be the case that hardware provides stronger guarantee than +- // the memory model. +- if constexpr (!final_sync) __threadfence_system(); ++// is_start: whether this is the very first synchronization barrier. ++// need_fence: whether a memory fence is needed. If true, a release-acquire ++// semantic is used to enforce memory access order before and after this ++// barrier. ++template ++DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg, ++ int rank) { ++ if constexpr (!is_start) __syncthreads(); ++ static_assert( ++ !(is_start && need_fence)); // Start barrier shouldn't need fence. + if (threadIdx.x < ngpus) { +- // reset flag for next time +- self_sg->start[blockIdx.x][threadIdx.x] = 0; +- // simultaneously write to the corresponding flag of all ranks. +- // Latency = 1 p2p write +- sg.signals[threadIdx.x]->end[blockIdx.x][rank] = 1; +- // wait until we got true from all ranks +- while (!self_sg->end[blockIdx.x][threadIdx.x]) +- ; ++ // Increment the counter. Technically we only need one counter, but we use ++ // multiple per block to eliminate the need to share the counter via smem. ++ auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1; ++ // Write the expected counter value to peer and wait for correct value from ++ // peer. ++ auto peer_counter_ptr = ++ &sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank]; ++ auto self_counter_ptr = ++ &self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x]; ++ if constexpr (need_fence) { ++ st_flag_release(peer_counter_ptr, val); ++ while (ld_flag_acquire(self_counter_ptr) != val); ++ } else { ++ st_flag_volatile(peer_counter_ptr, val); ++ while (ld_flag_volatile(self_counter_ptr) != val); ++ } + } +- if constexpr (!final_sync) __syncthreads(); ++ if constexpr (is_start || need_fence) __syncthreads(); + } + + template +-DINLINE P packed_reduce(const P *ptrs[], int idx) { ++DINLINE P packed_reduce(const P* ptrs[], int idx) { + A tmp = upcast(ptrs[0][idx]); + #pragma unroll + for (int i = 1; i < ngpus; i++) { +@@ -180,34 +209,31 @@ DINLINE P packed_reduce(const P *ptrs[], int idx) { + + template + __global__ void __launch_bounds__(512, 1) +- cross_device_reduce_1stage(RankData *_dp, RankSignals sg, +- volatile Signal *self_sg, T *__restrict__ result, +- int rank, int size) { ++ cross_device_reduce_1stage(RankData* _dp, RankSignals sg, Signal* self_sg, ++ T* __restrict__ result, int rank, int size) { + using P = typename packed_t::P; + using A = typename packed_t::A; + // note: we don't reorder the address so the accumulation order is the same + // for all ranks, ensuring bitwise identical results + auto dp = *_dp; +- start_sync(sg, self_sg, rank); ++ multi_gpu_barrier(sg, self_sg, rank); + // do the actual reduction + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { +- ((P *)result)[idx] = +- packed_reduce((const P **)&dp.ptrs[0], idx); ++ ((P*)result)[idx] = packed_reduce((const P**)&dp.ptrs[0], idx); + } +- end_sync(sg, self_sg, rank); ++ multi_gpu_barrier(sg, self_sg, rank); + } + + template +-DINLINE P *get_tmp_buf(volatile Signal *sg) { +- 
return (P *)(((Signal *)sg) + 1); ++DINLINE P* get_tmp_buf(Signal* sg) { ++ return (P*)(((Signal*)sg) + 1); + } + + template + __global__ void __launch_bounds__(512, 1) +- cross_device_reduce_2stage(RankData *_dp, RankSignals sg, +- volatile Signal *self_sg, T *__restrict__ result, +- int rank, int size) { ++ cross_device_reduce_2stage(RankData* _dp, RankSignals sg, Signal* self_sg, ++ T* __restrict__ result, int rank, int size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + using P = typename packed_t::P; +@@ -216,21 +242,21 @@ __global__ void __launch_bounds__(512, 1) + int start = rank * part; + int end = rank == ngpus - 1 ? size : start + part; + int largest_part = part + size % ngpus; +- const P *ptrs[ngpus]; +- P *tmps[ngpus]; ++ const P* ptrs[ngpus]; ++ P* tmps[ngpus]; + #pragma unroll + for (int i = 0; i < ngpus; i++) { + int target = (rank + i) % ngpus; +- ptrs[i] = (const P *)_dp->ptrs[target]; ++ ptrs[i] = (const P*)_dp->ptrs[target]; + tmps[i] = get_tmp_buf
(sg.signals[target]); } + auto tmp_out = tmps[0]; +- start_sync(sg, self_sg, rank); ++ multi_gpu_barrier(sg, self_sg, rank); + // stage 1: reduce scatter + for (int idx = start + tid; idx < end; idx += stride) { + tmp_out[idx - start] = packed_reduce(ptrs, idx); + } +- end_sync(sg, self_sg, rank); ++ multi_gpu_barrier(sg, self_sg, rank); + + // stage 2: allgather. Note: it's important to match the tid between + // the two stages, because visibility across devices is only guaranteed +@@ -243,7 +269,7 @@ __global__ void __launch_bounds__(512, 1) + int gather_from_rank = ((rank + i) % ngpus); + if (gather_from_rank == ngpus - 1 || idx < part) { + int dst_idx = gather_from_rank * part + idx; +- ((P *)result)[dst_idx] = tmps[i][idx]; ++ ((P*)result)[dst_idx] = tmps[i][idx]; + } + } + } +@@ -259,71 +285,76 @@ class CustomAllreduce { + int world_size_; + bool full_nvlink_; + +- // below are device pointers + RankSignals sg_; +- std::unordered_map buffers_; +- Signal *self_sg_; +- +- // stores the registered device pointers from all ranks ++ // Stores a map from a pointer to its peer pointers from all ranks. ++ std::unordered_map buffers_; ++ Signal* self_sg_; ++ ++ // Stores rank data from all ranks. This is mainly for cuda graph purposes. ++ // For cuda graph to work, all kernel arguments must be fixed during graph ++ // capture time. However, the peer pointers are not known during graph capture ++ // time. Therefore, during capture, we increment the rank data pointer and use ++ // that as the argument to the kernel. The kernel arguments are stored in ++ // graph_unreg_buffers_. The actual peer pointers will be filled in at the ++ // memory pointed to by the pointers in graph_unreg_buffers_ when ++ // the IPC handles are exchanged between ranks. ++ // ++ // The overall process looks like this: ++ // 1. Graph capture. ++ // 2. Each rank obtains the IPC handles for each address used during cuda ++ // graph capture using get_graph_buffer_ipc_meta. ++ // 3. (In Python) all gather the IPC handles. ++ // 4. Obtain the peer pointers by opening the IPC handles, and store them in ++ // the rank data array at corresponding positions. + RankData *d_rank_data_base_, *d_rank_data_end_; +- std::vector graph_unreg_buffers_; ++ std::vector graph_unreg_buffers_; + // a map from IPC handles to opened IPC pointers +- std::map ipc_handles_; ++ std::map ipc_handles_; + + /** +- * meta is a pointer to device metadata and temporary buffer for allreduce. ++ * Signals are an array of ipc-enabled buffers from all ranks. ++ * For each of the buffers, the layout is as follows: ++ * | -- sizeof(Signal) -- | ------ a few MB ----- | ++ * The first section is for allreduce synchronization, and the second section ++ * is for storing the intermediate results required by some allreduce algos. + * +- * There's a total of sizeof(Signal) of prefix before the actual data, +- * so meta + 1 points to actual temporary buffer. +- * +- * note: this class does not own any device memory. Any required buffers +- * are passed in from the constructor ++ * Note: this class does not own any device memory. Any required buffers ++ * are passed in from the constructor.
+ */ +- CustomAllreduce(Signal *meta, void *rank_data, size_t rank_data_sz, +- const cudaIpcMemHandle_t *handles, +- const std::vector &offsets, int rank, +- bool full_nvlink = true) ++ CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz, ++ int rank, int world_size, bool full_nvlink = true) + : rank_(rank), +- world_size_(offsets.size()), ++ world_size_(world_size), + full_nvlink_(full_nvlink), +- self_sg_(meta), +- d_rank_data_base_(reinterpret_cast(rank_data)), ++ self_sg_(signals[rank]), ++ d_rank_data_base_(reinterpret_cast(rank_data)), + d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) { + for (int i = 0; i < world_size_; i++) { +- Signal *rank_sg; +- if (i != rank_) { +- char *handle = open_ipc_handle(&handles[i]); +- handle += offsets[i]; +- rank_sg = (Signal *)handle; +- } else { +- rank_sg = self_sg_; +- } +- sg_.signals[i] = rank_sg; ++ sg_.signals[i] = signals[i]; + } + } + +- char *open_ipc_handle(const void *ipc_handle) { ++ char* open_ipc_handle(const void* ipc_handle) { + auto [it, new_handle] = +- ipc_handles_.insert({*((IPC_KEY *)ipc_handle), nullptr}); ++ ipc_handles_.insert({*((IPC_KEY*)ipc_handle), nullptr}); + if (new_handle) { +- char *ipc_ptr; +- CUDACHECK(cudaIpcOpenMemHandle((void **)&ipc_ptr, +- *((const cudaIpcMemHandle_t *)ipc_handle), ++ char* ipc_ptr; ++ CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptr, ++ *((const cudaIpcMemHandle_t*)ipc_handle), + cudaIpcMemLazyEnablePeerAccess)); + it->second = ipc_ptr; + } + return it->second; + } + +- std::pair, std::vector> +- get_graph_buffer_ipc_meta() { ++ std::pair> get_graph_buffer_ipc_meta() { + auto num_buffers = graph_unreg_buffers_.size(); + auto handle_sz = sizeof(cudaIpcMemHandle_t); +- std::vector handles(handle_sz * num_buffers, 0); ++ std::string handles(handle_sz * num_buffers, static_cast(0)); + std::vector offsets(num_buffers); + for (int i = 0; i < num_buffers; i++) { + auto ptr = graph_unreg_buffers_[i]; +- void *base_ptr; ++ void* base_ptr; + // note: must share the base address of each allocation, or we get wrong + // address + if (cuPointerGetAttribute(&base_ptr, +@@ -331,8 +362,8 @@ class CustomAllreduce { + (CUdeviceptr)ptr) != CUDA_SUCCESS) + throw std::runtime_error("failed to get pointer attr"); + CUDACHECK(cudaIpcGetMemHandle( +- (cudaIpcMemHandle_t *)&handles[i * handle_sz], base_ptr)); +- offsets[i] = ((char *)ptr) - ((char *)base_ptr); ++ (cudaIpcMemHandle_t*)&handles[i * handle_sz], base_ptr)); ++ offsets[i] = ((char*)ptr) - ((char*)base_ptr); + } + return std::make_pair(handles, offsets); + } +@@ -344,26 +375,22 @@ class CustomAllreduce { + std::to_string(d_rank_data_base_ + num - d_rank_data_end_)); + } + +- void register_buffer(const std::vector &handles, +- const std::vector &offsets, void *self) { ++ /** ++ * Register already-shared IPC pointers. ++ */ ++ void register_buffer(void** ptrs) { + check_rank_data_capacity(); + RankData data; + for (int i = 0; i < world_size_; i++) { +- if (i != rank_) { +- char *handle = open_ipc_handle(handles[i].data()); +- handle += offsets[i]; +- data.ptrs[i] = handle; +- } else { +- data.ptrs[i] = self; +- } ++ data.ptrs[i] = ptrs[i]; + } + auto d_data = d_rank_data_base_++; + CUDACHECK( + cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice)); +- buffers_[self] = d_data; ++ buffers_[ptrs[rank_]] = d_data; + } + +- // note: when registering graph buffers, we intentionally choose to not ++ // Note: when registering graph buffers, we intentionally choose to not + // deduplicate the addresses. 
That means if the allocator reuses some + // addresses, they will be registered again. This is to account for the remote + // possibility of different allocation patterns between ranks. For example, +@@ -371,17 +398,17 @@ class CustomAllreduce { + // got a different address. IPC handles have internal reference counting + // mechanism so overhead should be small. + void register_graph_buffers( +- const std::vector &handles, +- const std::vector> &offsets) { ++ const std::vector& handles, ++ const std::vector>& offsets) { + auto num_buffers = graph_unreg_buffers_.size(); + check_rank_data_capacity(num_buffers); + std::vector rank_data(num_buffers); + for (int i = 0; i < num_buffers; i++) { + auto self_ptr = graph_unreg_buffers_[i]; +- auto &rd = rank_data[i]; ++ auto& rd = rank_data[i]; + for (int j = 0; j < world_size_; j++) { + if (j != rank_) { +- char *handle = ++ char* handle = + open_ipc_handle(&handles[j][i * sizeof(cudaIpcMemHandle_t)]); + handle += offsets[j][i]; + rd.ptrs[j] = handle; +@@ -398,14 +425,16 @@ class CustomAllreduce { + } + + /** +- * This is the result after careful grid search. Using 36 blocks give the best +- * or close to the best runtime on the devices I tried: A100, A10, A30, T4, +- * V100. You'll notice that NCCL kernels also only take a small amount of SMs. +- * Not quite sure the underlying reason, but my guess is that too many SMs +- * will cause contention on NVLink bus. ++ * Performs allreduce, assuming input has already been registered. ++ * ++ * Block and grid default configs are results after careful grid search. Using ++ * 36 blocks give the best or close to the best runtime on the devices I ++ * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only ++ * take a small amount of SMs. Not quite sure the underlying reason, but my ++ * guess is that too many SMs will cause contention on NVLink bus. + */ + template +- void allreduce(cudaStream_t stream, T *input, T *output, int size, ++ void allreduce(cudaStream_t stream, T* input, T* output, int size, + int threads = 512, int block_limit = 36) { + auto d = packed_t::P::size; + if (size % d != 0) +@@ -418,7 +447,7 @@ class CustomAllreduce { + std::to_string(kMaxBlocks) + ". Got " + + std::to_string(block_limit)); + +- RankData *ptrs; ++ RankData* ptrs; + cudaStreamCaptureStatus status; + CUDACHECK(cudaStreamIsCapturing(stream, &status)); + if (status == cudaStreamCaptureStatusActive) { +@@ -440,6 +469,8 @@ class CustomAllreduce { + #define KL(ngpus, name) \ + name<<>>(ptrs, sg_, self_sg_, output, \ + rank_, size); ++ // TODO(hanzhi713): Threshold is different for A100 and H100. ++ // Add per device threshold. + #define REDUCE_CASE(ngpus) \ + case ngpus: { \ + if (world_size_ == 2) { \ +diff --git a/csrc/custom_all_reduce_test.cu b/csrc/custom_all_reduce_test.cu +index c34a503..b59ea40 100644 +--- a/csrc/custom_all_reduce_test.cu ++++ b/csrc/custom_all_reduce_test.cu +@@ -1,15 +1,15 @@ + /** + * This is a standalone test for custom allreduce. + * To compile, make sure you have MPI and NCCL installed in your system. +- * export MPI_HOME=XXX ++ * export MPI_HOME=xxx + * nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o +- * custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi ++ * custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi + * + * Warning: this C++ test is not designed to be very readable and was used + * during the rapid prototyping process. 
+ * + * To run: +- * mpirun -np 8 ./custom_all_reduce_test ++ * mpirun --allow-run-as-root -np 8 ./custom_all_reduce_test + */ + #include + #include +@@ -44,11 +44,18 @@ + } while (0) + + __global__ void dummy_kernel() { ++#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms ++#else ++ for (int i = 0; i < 100; i++) { ++ long long int start = clock64(); ++ while (clock64() - start < 150000000); // approximately 98.4ms on P40 ++ } ++#endif + } + + template +-__global__ void set_data(T *data, int size, int myRank) { ++__global__ void set_data(T* data, int size, int myRank) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + data[idx] = myRank * 0.11f; +@@ -56,8 +63,8 @@ __global__ void set_data(T *data, int size, int myRank) { + } + + template +-__global__ void convert_data(const T *data1, const T *data2, double *fdata1, +- double *fdata2, int size) { ++__global__ void convert_data(const T* data1, const T* data2, double* fdata1, ++ double* fdata2, int size) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + fdata1[idx] = data1[idx]; +@@ -65,7 +72,7 @@ __global__ void convert_data(const T *data1, const T *data2, double *fdata1, + } + } + +-__global__ void init_rand(curandState_t *state, int size, int nRanks) { ++__global__ void init_rand(curandState_t* state, int size, int nRanks) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + for (int i = 0; i < nRanks; i++) { +@@ -75,7 +82,7 @@ __global__ void init_rand(curandState_t *state, int size, int nRanks) { + } + + template +-__global__ void gen_data(curandState_t *state, T *data, double *ground_truth, ++__global__ void gen_data(curandState_t* state, T* data, double* ground_truth, + int myRank, int nRanks, int size) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { +@@ -91,9 +98,9 @@ __global__ void gen_data(curandState_t *state, T *data, double *ground_truth, + } + + template +-void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit, ++void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit, + int data_size, bool performance_test) { +- T *result; ++ T* result; + cudaStream_t stream; + CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CUDACHECK(cudaMalloc(&result, data_size * sizeof(T))); +@@ -101,8 +108,8 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit, + + cudaIpcMemHandle_t self_data_handle; + cudaIpcMemHandle_t data_handles[8]; +- vllm::Signal *buffer; +- T *self_data_copy; ++ vllm::Signal* buffer; ++ T* self_data_copy; + /** + * Allocate IPC buffer + * +@@ -125,32 +132,34 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit, + MPI_BYTE, data_handles, sizeof(cudaIpcMemHandle_t), + MPI_BYTE, MPI_COMM_WORLD)); + +- void *rank_data; ++ void* rank_data; + size_t rank_data_sz = 16 * 1024 * 1024; + CUDACHECK(cudaMalloc(&rank_data, rank_data_sz)); +- std::vector offsets(nRanks, 0); +- vllm::CustomAllreduce fa(buffer, rank_data, rank_data_sz, data_handles, +- offsets, myRank); +- auto *self_data = +- reinterpret_cast(reinterpret_cast(buffer) + +- sizeof(vllm::Signal) + data_size * sizeof(T)); ++ vllm::Signal* ipc_ptrs[8]; ++ for (int i = 0; i < nRanks; i++) { ++ if (i == myRank) ++ ipc_ptrs[i] = buffer; ++ else ++ 
CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptrs[i], data_handles[i], ++ cudaIpcMemLazyEnablePeerAccess)); ++ } ++ vllm::CustomAllreduce fa(ipc_ptrs, rank_data, rank_data_sz, myRank, nRanks); ++ auto* self_data = ++ reinterpret_cast(reinterpret_cast(buffer) + ++ sizeof(vllm::Signal) + data_size * sizeof(T)); + // hack buffer registration + { +- std::vector handles; +- handles.reserve(nRanks); ++ void* data[8]; + for (int i = 0; i < nRanks; i++) { +- char *begin = (char *)&data_handles[i]; +- char *end = (char *)&data_handles[i + 1]; +- handles.emplace_back(begin, end); ++ data[i] = ++ ((char*)ipc_ptrs[i]) + sizeof(vllm::Signal) + data_size * sizeof(T); + } +- std::vector offsets(nRanks, +- sizeof(vllm::Signal) + data_size * sizeof(T)); +- fa.register_buffer(handles, offsets, self_data); ++ fa.register_buffer(data); + } + +- double *ground_truth; ++ double* ground_truth; + CUDACHECK(cudaMallocHost(&ground_truth, data_size * sizeof(double))); +- curandState_t *states; ++ curandState_t* states; + CUDACHECK(cudaMalloc(&states, sizeof(curandState_t) * nRanks * data_size)); + init_rand<<<108, 1024, 0, stream>>>(states, data_size, nRanks); + gen_data<<<108, 1024, 0, stream>>>(states, self_data, ground_truth, myRank, +@@ -287,7 +296,7 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit, + CUDACHECK(cudaStreamDestroy(stream)); + } + +-int main(int argc, char **argv) { ++int main(int argc, char** argv) { + int nRanks, myRank; + MPICHECK(MPI_Init(&argc, &argv)); + MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank)); +@@ -296,21 +305,25 @@ int main(int argc, char **argv) { + ncclUniqueId id; + ncclComm_t comm; + if (myRank == 0) ncclGetUniqueId(&id); +- MPICHECK(MPI_Bcast(static_cast(&id), sizeof(id), MPI_BYTE, 0, ++ MPICHECK(MPI_Bcast(static_cast(&id), sizeof(id), MPI_BYTE, 0, + MPI_COMM_WORLD)); + NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank)); + + bool performance_test = true; + cudaProfilerStart(); +- // for (int threads : {256, 512}) { ++ // Uncomment to scan through different block size configs. ++ // for (int threads : {256, 512, 1024}) { + // for (int block_limit = 16; block_limit < 112; block_limit += 4) { +- // run(myRank, nRanks, comm, threads, block_limit, 4096 * 1024); ++ // run(myRank, nRanks, comm, threads, block_limit, 1024 * 1024, ++ // performance_test); + // } + // } ++ // Scan through different sizes to test performance. 
+ for (int sz = 512; sz <= (8 << 20); sz *= 2) { + run(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test); + } + + cudaProfilerStop(); ++ MPICHECK(MPI_Finalize()); + return EXIT_SUCCESS; + } +diff --git a/csrc/cutlass_extensions/common.cpp b/csrc/cutlass_extensions/common.cpp +new file mode 100644 +index 0000000..3d2093a +--- /dev/null ++++ b/csrc/cutlass_extensions/common.cpp +@@ -0,0 +1,11 @@ ++#include "cutlass_extensions/common.hpp" ++ ++int32_t get_sm_version_num() { ++ int32_t major_capability, minor_capability; ++ cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor, ++ 0); ++ cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor, ++ 0); ++ int32_t version_num = major_capability * 10 + minor_capability; ++ return version_num; ++} +\ No newline at end of file +diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp +new file mode 100644 +index 0000000..85e359a +--- /dev/null ++++ b/csrc/cutlass_extensions/common.hpp +@@ -0,0 +1,35 @@ ++#pragma once ++ ++#include "cutlass/cutlass.h" ++#include ++#include "cuda_runtime.h" ++#include ++ ++/** ++ * Helper function for checking CUTLASS errors ++ */ ++#define CUTLASS_CHECK(status) \ ++ { \ ++ cutlass::Status error = status; \ ++ TORCH_CHECK(error == cutlass::Status::kSuccess, \ ++ cutlassGetStatusString(error)); \ ++ } ++ ++/** ++ * Panic wrapper for unwinding CUDA runtime errors ++ */ ++#define CUDA_CHECK(status) \ ++ { \ ++ cudaError_t error = status; \ ++ TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \ ++ } ++ ++inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { ++ int max_shared_mem_per_block_opt_in = 0; ++ cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, ++ cudaDevAttrMaxSharedMemoryPerBlockOptin, ++ device); ++ return max_shared_mem_per_block_opt_in; ++} ++ ++int32_t get_sm_version_num(); +diff --git a/csrc/cutlass_extensions/cute_utils.cuh b/csrc/cutlass_extensions/cute_utils.cuh +new file mode 100644 +index 0000000..f61fe3c +--- /dev/null ++++ b/csrc/cutlass_extensions/cute_utils.cuh +@@ -0,0 +1,68 @@ ++#pragma once ++ ++#include ++#include ++namespace cute { ++ ++//////////////////////////////////////////////////////////////////// ++// layout utils ++//////////////////////////////////////////////////////////////////// ++ ++// Permute layout based on indices, example: ++// permute_layout<1, 0>(layout) will swap the two dimensions ++// permute_layout<0, 2, 1>(layout) will swap the last two dimensions ++template ++CUTE_HOST_DEVICE static constexpr auto permute_layout(Layout l) { ++ static_assert(rank(l) == sizeof...(I), "Invalid permutation, rank mismatch"); ++ return cute::make_layout(cute::get(l)...); ++} ++ ++// is the layout f(x) = x ++template ++CUTE_HOST_DEVICE static constexpr bool is_identity_layout() { ++ if constexpr (std::is_same_v) { ++ return true; ++ } else { ++ constexpr auto coalesced_layout = coalesce(Layout{}); ++ if constexpr (rank(coalesced_layout) == 1 && ++ stride<0>(coalesced_layout) == 1) { ++ return true; ++ } ++ return false; ++ } ++} ++ ++//////////////////////////////////////////////////////////////////// ++// Pointer utils ++//////////////////////////////////////////////////////////////////// ++ ++template ++static constexpr auto get_logical_ptr(PointerType* ptr) { ++ if constexpr (cute::sizeof_bits_v < 8) { ++ return cute::subbyte_iterator(ptr); ++ } else { ++ return ptr; ++ } ++} ++ ++//////////////////////////////////////////////////////////////////// ++// Misc utils 
++//////////////////////////////////////////////////////////////////// ++ ++template ++CUTE_HOST_DEVICE static constexpr auto create_auto_vectorizing_copy() { ++ constexpr auto bits = sizeof_bits_v * Elements{}; ++ if constexpr (bits % 128 == 0) { ++ return AutoVectorizingCopyWithAssumedAlignment<128>{}; ++ } else if constexpr (bits % 64 == 0) { ++ return AutoVectorizingCopyWithAssumedAlignment<64>{}; ++ } else if constexpr (bits % 32 == 0) { ++ return AutoVectorizingCopyWithAssumedAlignment<32>{}; ++ } else if constexpr (bits % 16 == 0) { ++ return AutoVectorizingCopyWithAssumedAlignment<16>{}; ++ } else { ++ return AutoVectorizingCopyWithAssumedAlignment<8>{}; ++ } ++} ++ ++}; // namespace cute +diff --git a/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp +new file mode 100644 +index 0000000..7aa87fe +--- /dev/null ++++ b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp +@@ -0,0 +1,497 @@ ++/*************************************************************************************************** ++ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights ++ *reserved. SPDX-License-Identifier: BSD-3-Clause ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright notice, ++ *this list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above copyright notice, ++ * this list of conditions and the following disclaimer in the documentation ++ * and/or other materials provided with the distribution. ++ * ++ * 3. Neither the name of the copyright holder nor the names of its ++ * contributors may be used to endorse or promote products derived from ++ * this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE ++ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++ *POSSIBILITY OF SUCH DAMAGE. ++ * ++ **************************************************************************************************/ ++ ++// ++// This file is a modified excerpt of ++// include/cutlass/epilogue/fusion/visitor_load.hpp from ++// https://github.com/NVIDIA/cutlass v3.5.0 ++// It has been modified to support either ++// row/column or scalar broadcasting where the tensor being loaded from is ++// always passed in via a device pointer. This lets one compiled kernel handle ++// all cases of per-tensor or per-channel/per-token quantization. 
++// ++// This interface also allows the scales to be passed in as tensors that ++// consistently reside on the device, which avoids an issue with a previous ++// implementation where scalars needed to be on the CPU since they ++// were passed in via float values. This created a potential performance hazard ++// if scales were initially on the device, and caused torch.compile graph ++// breaks when moving scales to the CPU. ++// ++#pragma once ++ ++// Turn off clang-format for the entire file to keep it close to upstream ++// clang-format off ++ ++#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp" ++#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" ++#include "cute/tensor.hpp" ++ ++namespace cutlass::epilogue::threadblock { ++ ++using namespace cute; ++using namespace detail; ++ ++template< ++ class ThreadMap, ++ class Element, ++ class StrideMNL ++> ++struct VisitorRowOrScalarBroadcast { ++ ++ // This struct has been modified to have a bool indicating that ptr_row is a ++ // scalar that must be broadcast. ++ struct Arguments { ++ Element const* ptr_row = nullptr; ++ bool row_broadcast = true; ++ StrideMNL dRow = {}; ++ }; ++ ++ using Params = Arguments; ++ ++ template ++ static constexpr Params ++ to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { ++ return args; ++ } ++ ++ template ++ static size_t ++ get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { ++ return 0; ++ } ++ ++ struct SharedStorage {}; ++ ++ // Global load type ++ static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits::value; ++ using VecType = uint_bit_t; ++ static int constexpr VecLength = sizeof(VecType) / sizeof(Element); ++ ++ CUTLASS_HOST_DEVICE ++ VisitorRowOrScalarBroadcast() { } ++ ++ CUTLASS_HOST_DEVICE ++ VisitorRowOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage) ++ : params_ptr(¶ms) { } ++ ++ Params const* params_ptr; ++ ++ template ++ struct Callbacks : EmptyCallbacks { ++ CUTLASS_DEVICE ++ Callbacks( ++ GTensor&& tC_gRow, ++ RTensor&& tC_rRow, ++ CTensor&& tC_cRow, ++ ProblemShape problem_shape, ++ Params const* params_ptr ++ ): ++ tC_gRow(cute::forward(tC_gRow)), ++ tC_rRow(cute::forward(tC_rRow)), ++ tC_cRow(cute::forward(tC_cRow)), ++ n(get<1>(problem_shape)), ++ params_ptr(params_ptr) { } ++ ++ GTensor tC_gRow; ++ RTensor tC_rRow; ++ CTensor tC_cRow; ++ Params const* params_ptr; ++ int n; ++ ++ // This function is modified from VisitorRowBroadcast ++ CUTLASS_DEVICE void ++ begin_epilogue() { ++ clear(tC_rRow); ++ auto src_v = filter(tC_gRow); ++ auto coord_v = filter(tC_cRow); ++ auto dst_v = filter(tC_rRow); ++ ++ if (params_ptr->row_broadcast) { ++ // In this case we are loading from a row vector and broadcasting ++ CUTLASS_PRAGMA_UNROLL ++ for (int i = 0; i < size(src_v); ++i) { ++ bool guard = get<1>(coord_v(i)) < n; ++ cutlass::arch::global_load( ++ dst_v(i), (void const*)&src_v(i), guard); ++ } ++ } else { ++ // In this case we are loading from a scalar and broadcasting ++ VecType filled_vec; ++ CUTLASS_PRAGMA_UNROLL ++ for (int i = 0; i < VecLength; i++) { ++ reinterpret_cast(&filled_vec)[i] = *(params_ptr->ptr_row); ++ } ++ ++ CUTLASS_PRAGMA_UNROLL ++ for (int i = 0; i < size(src_v); ++i) { ++ if (get<1>(coord_v(i)) < n) { ++ dst_v(i) = filled_vec; ++ } ++ } ++ } ++ } ++ ++ template ++ CUTLASS_DEVICE auto // returns an Array ++ visit(int iter_idx, int row_idx, int column_idx, int frg_idx, ++ Array const& frg_acc) { ++ Tensor rRow_frg = 
recast>(coalesce(tC_rRow)); ++ return rRow_frg(column_idx); ++ } ++ }; ++ ++ template ++ CUTLASS_DEVICE auto ++ get_callbacks( ++ gemm::GemmCoord threadblock_tile_offset, ++ int thread_idx, ++ ProblemShape problem_shape ++ ) { ++ Tensor mRow = make_tensor( ++ make_gmem_ptr(params_ptr->ptr_row), ++ problem_shape, ++ params_ptr->dRow); ++ ++ // VECTOR, FRAGMENT_COLUMN ++ Tensor tC_gRow = recast( ++ ThreadMap::partition(mRow, thread_idx, threadblock_tile_offset) ++ )(_,_,_0{},_0{},_0{},_0{}); ++ Tensor tC_rRow = make_tensor_like(tC_gRow); ++ ++ // Generate the pred tensor ++ Tensor cRow = make_identity_tensor(mRow.shape()); ++ Tensor tC_cRow = outer_partition( ++ ThreadMap::partition(cRow, thread_idx, threadblock_tile_offset)(_,_,_0{},_0{},_0{},_0{}), ++ Shape>{}, ++ (_0{}) ++ ); ++ ++ return Callbacks< ++ decltype(tC_gRow), decltype(tC_rRow), ++ decltype(tC_cRow), ProblemShape>( ++ cute::move(tC_gRow), ++ cute::move(tC_rRow), ++ cute::move(tC_cRow), ++ problem_shape, ++ params_ptr ++ ); ++ } ++ ++}; ++ ++///////////////////////////////////////////////////////////////////////////////////////////////// ++ ++// This is a modified RowBroadcast that will broadcast 0 if ptr_row is null ++template< ++ class ThreadMap, ++ class Element, ++ class StrideMNL ++> ++struct VisitorRowOrZeroBroadcast { ++ ++ // This struct has been modified to remove null_default (because it's always 0) ++ struct Arguments { ++ Element const* ptr_row = nullptr; ++ StrideMNL dRow = {}; ++ }; ++ ++ using Params = Arguments; ++ ++ template ++ static constexpr Params ++ to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { ++ return args; ++ } ++ ++ template ++ static size_t ++ get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { ++ return 0; ++ } ++ ++ struct SharedStorage {}; ++ ++ // Global load type ++ static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits::value; ++ using VecType = uint_bit_t; ++ static int constexpr VecLength = sizeof(VecType) / sizeof(Element); ++ ++ CUTLASS_HOST_DEVICE ++ VisitorRowOrZeroBroadcast() { } ++ ++ CUTLASS_HOST_DEVICE ++ VisitorRowOrZeroBroadcast(Params const& params, SharedStorage const& shared_storage) ++ : params_ptr(¶ms) { } ++ ++ Params const* params_ptr; ++ ++ template ++ struct Callbacks : EmptyCallbacks { ++ CUTLASS_DEVICE ++ Callbacks( ++ GTensor&& tC_gRow, ++ RTensor&& tC_rRow, ++ CTensor&& tC_cRow, ++ ProblemShape problem_shape, ++ Params const* params_ptr ++ ): ++ tC_gRow(cute::forward(tC_gRow)), ++ tC_rRow(cute::forward(tC_rRow)), ++ tC_cRow(cute::forward(tC_cRow)), ++ n(get<1>(problem_shape)), ++ params_ptr(params_ptr) { } ++ ++ GTensor tC_gRow; ++ RTensor tC_rRow; ++ CTensor tC_cRow; ++ Params const* params_ptr; ++ int n; ++ ++ // This function is modified from VisitorRowBroadcast ++ CUTLASS_DEVICE void ++ begin_epilogue() { ++ clear(tC_rRow); ++ auto src_v = filter(tC_gRow); ++ auto coord_v = filter(tC_cRow); ++ auto dst_v = filter(tC_rRow); ++ ++ if (params_ptr->ptr_row != nullptr) { ++ // In this case we are loading from a row vector and broadcasting ++ CUTLASS_PRAGMA_UNROLL ++ for (int i = 0; i < size(src_v); ++i) { ++ bool guard = get<1>(coord_v(i)) < n; ++ cutlass::arch::global_load( ++ dst_v(i), (void const*)&src_v(i), guard); ++ } ++ } else { ++ // In this case we are broadcasting 0 ++ VecType filled_vec; ++ CUTLASS_PRAGMA_UNROLL ++ for (int i = 0; i < VecLength; i++) { ++ reinterpret_cast(&filled_vec)[i] = Element{0}; ++ } ++ ++ CUTLASS_PRAGMA_UNROLL ++ for (int i = 0; i < 
size(src_v); ++i) { ++ if (get<1>(coord_v(i)) < n) { ++ dst_v(i) = filled_vec; ++ } ++ } ++ } ++ } ++ ++ template ++ CUTLASS_DEVICE auto // returns an Array ++ visit(int iter_idx, int row_idx, int column_idx, int frg_idx, ++ Array const& frg_acc) { ++ Tensor rRow_frg = recast>(coalesce(tC_rRow)); ++ return rRow_frg(column_idx); ++ } ++ }; ++ ++ template ++ CUTLASS_DEVICE auto ++ get_callbacks( ++ gemm::GemmCoord threadblock_tile_offset, ++ int thread_idx, ++ ProblemShape problem_shape ++ ) { ++ Tensor mRow = make_tensor( ++ make_gmem_ptr(params_ptr->ptr_row), ++ problem_shape, ++ params_ptr->dRow); ++ ++ // VECTOR, FRAGMENT_COLUMN ++ Tensor tC_gRow = recast( ++ ThreadMap::partition(mRow, thread_idx, threadblock_tile_offset) ++ )(_,_,_0{},_0{},_0{},_0{}); ++ Tensor tC_rRow = make_tensor_like(tC_gRow); ++ ++ // Generate the pred tensor ++ Tensor cRow = make_identity_tensor(mRow.shape()); ++ Tensor tC_cRow = outer_partition( ++ ThreadMap::partition(cRow, thread_idx, threadblock_tile_offset)(_,_,_0{},_0{},_0{},_0{}), ++ Shape>{}, ++ (_0{}) ++ ); ++ ++ return Callbacks< ++ decltype(tC_gRow), decltype(tC_rRow), ++ decltype(tC_cRow), ProblemShape>( ++ cute::move(tC_gRow), ++ cute::move(tC_rRow), ++ cute::move(tC_cRow), ++ problem_shape, ++ params_ptr ++ ); ++ } ++ ++}; ++ ++ ++///////////////////////////////////////////////////////////////////////////////////////////////// ++ ++// Column vector broadcast ++template< ++ class ThreadMap, ++ class Element, ++ class StrideMNL = Stride<_1,_0,_0> ++> ++struct VisitorColOrScalarBroadcast { ++ ++ // This struct has been modified to have a bool indicating that ptr_col is a ++ // scalar that must be broadcast. ++ struct Arguments { ++ Element const* ptr_col = nullptr; ++ bool col_broadcast = true; ++ StrideMNL dCol = {}; ++ }; ++ ++ using Params = Arguments; ++ ++ template ++ static constexpr Params ++ to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { ++ return args; ++ } ++ ++ template ++ static size_t ++ get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { ++ return 0; ++ } ++ ++ struct SharedStorage { }; ++ ++ CUTLASS_HOST_DEVICE ++ VisitorColOrScalarBroadcast() { } ++ ++ CUTLASS_HOST_DEVICE ++ VisitorColOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage) ++ : params_ptr(¶ms) { } ++ ++ Params const* params_ptr; ++ ++ template ++ struct Callbacks : EmptyCallbacks { ++ CUTLASS_DEVICE ++ Callbacks( ++ GTensor&& tC_gCol, ++ RTensor&& tC_rCol, ++ CTensor&& tC_cCol, ++ ProblemShape problem_shape, ++ Params const* params_ptr ++ ): ++ tC_gCol(cute::forward(tC_gCol)), ++ tC_rCol(cute::forward(tC_rCol)), ++ tC_cCol(cute::forward(tC_cCol)), ++ m(get<0>(problem_shape)), ++ params_ptr(params_ptr) { } ++ ++ GTensor tC_gCol; ++ RTensor tC_rCol; ++ CTensor tC_cCol; ++ Params const* params_ptr; ++ int m; ++ ++ // This function is modified from VisitorColBroadcast ++ CUTLASS_DEVICE void ++ begin_epilogue() { ++ clear(tC_rCol); ++ ++ Tensor pred = make_tensor(shape(tC_gCol)); ++ CUTLASS_PRAGMA_UNROLL ++ for (int i = 0; i < size(pred); ++i) { ++ pred(i) = get<0>(tC_cCol(i)) < m; ++ } ++ ++ if (params_ptr->col_broadcast) { ++ // In this case we are loading from a column vector and broadcasting ++ copy_if(pred, tC_gCol, tC_rCol); ++ } else { ++ // In this case we are loading from a scalar and broadcasting ++ auto dst_v = filter(tC_rCol); ++ ++ CUTLASS_PRAGMA_UNROLL ++ for (int i = 0; i < size(dst_v); ++i) { ++ if (pred(i)) { ++ dst_v(i) = *(params_ptr->ptr_col); ++ } ++ } ++ 
} ++ } ++ ++ template ++ CUTLASS_DEVICE auto // returns an Array ++ visit(int iter_idx, int row_idx, int column_idx, int frg_idx, ++ Array const& frg_acc) { ++ Array frg_col; ++ frg_col.fill(tC_rCol(row_idx,iter_idx)); ++ return frg_col; ++ } ++ }; ++ ++ template ++ CUTLASS_DEVICE auto ++ get_callbacks( ++ gemm::GemmCoord threadblock_tile_offset, ++ int thread_idx, ++ ProblemShape problem_shape ++ ) { ++ Tensor mCol = make_tensor( ++ make_gmem_ptr(params_ptr->ptr_col), ++ problem_shape, ++ params_ptr->dCol); ++ ++ // VECTOR, FRAGMENT_COLUMN, FRAGMENT_ROW, ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER ++ Tensor tC_gCol = group_modes<1,4>( ++ ThreadMap::partition(mCol, thread_idx, threadblock_tile_offset)(_0{},_0{},_,_,_,_)); ++ Tensor tC_rCol = make_tensor_like(tC_gCol); ++ ++ // Generate the pred tensor ++ Tensor cCol = make_identity_tensor(mCol.shape()); ++ Tensor tC_cCol = group_modes<1,4>( ++ ThreadMap::partition(cCol, thread_idx, threadblock_tile_offset)(_0{},_0{},_,_,_,_)); ++ ++ return Callbacks< ++ decltype(tC_gCol), decltype(tC_rCol), ++ decltype(tC_cCol), ProblemShape>( ++ cute::move(tC_gCol), ++ cute::move(tC_rCol), ++ cute::move(tC_cCol), ++ problem_shape, ++ params_ptr ++ ); ++ } ++}; ++ ++} +diff --git a/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp +new file mode 100644 +index 0000000..58b1e8f +--- /dev/null ++++ b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp +@@ -0,0 +1,447 @@ ++/*************************************************************************************************** ++ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights ++ *reserved. SPDX-License-Identifier: BSD-3-Clause ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright notice, ++ *this list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above copyright notice, ++ * this list of conditions and the following disclaimer in the documentation ++ * and/or other materials provided with the distribution. ++ * ++ * 3. Neither the name of the copyright holder nor the names of its ++ * contributors may be used to endorse or promote products derived from ++ * this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE ++ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++ *POSSIBILITY OF SUCH DAMAGE. 
++ * ++ **************************************************************************************************/ ++ ++// ++// This file is a modified excerpt of ++// include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp ++// from https://github.com/NVIDIA/cutlass v3.5.0 ++// It has been modified to support either row/column or scalar broadcasting ++// where the tensor being loaded from is always passed in via a device pointer. ++// This lets one compiled kernel handle all cases of per-tensor or ++// per-channel/per-token quantization. ++// ++// This interface also allows the scales to be passed in as tensors that ++// consistently reside on the device, which avoids an issue with a previous ++// implementation where scalars needed to be on the CPU since they ++// were passed in via float values. This created a potential performance hazard ++// if scales were initially on the device, and caused torch.compile graphs ++// breaks when moving scales to the CPU. ++// ++#pragma once ++ ++// Turn off clang-format for the entire file to keep it close to upstream ++// clang-format off ++ ++#include "cutlass/cutlass.h" ++#include "cutlass/arch/barrier.h" ++ ++#include "cute/tensor.hpp" ++#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" ++ ++namespace cutlass::epilogue::fusion { ++ ++using namespace cute; ++using namespace detail; ++ ++// Row vector broadcast ++template< ++ int Stages, ++ class CtaTileShapeMNK, ++ class Element, ++ class StrideMNL = Stride<_0,_1,_0>, ++ int Alignment = 128 / sizeof_bits_v ++> ++struct Sm90RowOrScalarBroadcast { ++ static_assert(Stages == 0, "Row broadcast doesn't support smem usage"); ++ static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static ++ static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{}); ++ ++ struct SharedStorage { ++ array_aligned(CtaTileShapeMNK{})> smem; ++ }; ++ ++ // This struct has been modified to have a bool indicating that ptr_row is a ++ // scalar that must be broadcast, instead of containing a scalar that is ++ // valid if ptr_row is null. 
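  // Illustrative sketch only (a hypothetical helper, not used by the kernel):
  // per output column j, the row-or-scalar semantics implemented by this
  // visitor reduce to the following, with the choice driven by a runtime bool
  // rather than a nullptr check or a host-side float scalar.
  //
  //   CUTLASS_HOST_DEVICE
  //   static Element reference_row_value(Element const* ptr_row,
  //                                      bool row_broadcast, int j) {
  //     // row_broadcast == true : per-channel scales, one device value per column j
  //     // row_broadcast == false: per-tensor scale, one device value reused for all j
  //     return row_broadcast ? ptr_row[j] : ptr_row[0];
  //   }
  //
  // Because both paths read through a device pointer, one compiled kernel
  // covers per-tensor and per-channel quantization and the scale never has to
  // be copied to the CPU.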
++ struct Arguments { ++ Element const* ptr_row = nullptr; ++ bool row_broadcast = true; ++ StrideMNL dRow = {}; ++ }; ++ ++ using Params = Arguments; ++ ++ template ++ static constexpr Params ++ to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { ++ return args; ++ } ++ ++ template ++ static bool ++ can_implement(ProblemShape const& problem_shape, Arguments const& args) { ++ return true; ++ } ++ ++ template ++ static size_t ++ get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { ++ return 0; ++ } ++ ++ template ++ static cutlass::Status ++ initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, ++ CudaHostAdapter* cuda_adapter = nullptr) { ++ return cutlass::Status::kSuccess; ++ } ++ ++ CUTLASS_HOST_DEVICE ++ Sm90RowOrScalarBroadcast() { } ++ ++ CUTLASS_HOST_DEVICE ++ Sm90RowOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage) ++ : params(params) ++ , smem(const_cast(shared_storage.smem.data())) { } ++ ++ Params params; ++ Element *smem = nullptr; ++ ++ CUTLASS_DEVICE bool ++ is_producer_load_needed() const { ++ return false; ++ } ++ ++ CUTLASS_DEVICE bool ++ is_C_load_needed() const { ++ return false; ++ } ++ ++ CUTLASS_DEVICE bool ++ is_zero() const { ++ return (!params.row_broadcast && *(params.ptr_row) == Element(0)); ++ } ++ ++ template ++ CUTLASS_DEVICE auto ++ get_producer_load_callbacks(ProducerLoadArgs const& args) { ++ return EmptyProducerLoadCallbacks{}; ++ } ++ ++ template ++ struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { ++ CUTLASS_DEVICE ++ ConsumerStoreCallbacks( ++ GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_, ++ GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_, ++ SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_, ++ CTensor tCcRow_, ThrResidue residue_tCcRow_, ThrNum thr_num_, Params const& params_) ++ : tGS_gRow(tGS_gRow_) ++ , tGS_sRow(tGS_sRow_) ++ , tGS_cRow(tGS_cRow_) ++ , tiled_G2S(tiled_g2s_) ++ , tSR_sRow(tSR_sRow_) ++ , tSR_rRow(tSR_rRow_) ++ , tCcRow(tCcRow_) ++ , residue_tCcRow(residue_tCcRow_) ++ , params(params_) {} ++ ++ GS_GTensor tGS_gRow; // (CPY,CPY_M,CPY_N) ++ GS_STensor tGS_sRow; // (CPY,CPY_M,CPY_N) ++ GS_CTensor tGS_cRow; // (CPY,CPY_M,CPY_N) ++ Tiled_G2S tiled_G2S; ++ ++ SR_STensor tSR_sRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) ++ SR_RTensor tSR_rRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) ++ ++ CTensor tCcRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) ++ ThrResidue residue_tCcRow; // (m, n) ++ ThrNum thr_num; ++ Params const& params; ++ ++ CUTLASS_DEVICE void ++ begin() { ++ if (!params.row_broadcast) { ++ fill(tSR_rRow, *(params.ptr_row)); ++ return; ++ } ++ ++ auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; ++ Tensor tGS_gRow_flt = filter_zeros(tGS_gRow); ++ Tensor tGS_sRow_flt = filter_zeros(tGS_sRow); ++ Tensor tGS_cRow_flt = make_tensor(tGS_cRow.data(), make_layout(tGS_gRow_flt.shape(), tGS_cRow.stride())); ++ ++ for (int i = 0; i < size(tGS_gRow_flt); ++i) { ++ if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) { ++ continue; // OOB of SMEM, ++ } ++ if (elem_less(tGS_cRow_flt(i), make_coord(get<0>(residue_tCcRow), get<1>(residue_tCcRow)))) { ++ tGS_sRow_flt(i) = tGS_gRow_flt(i); ++ } ++ else { ++ tGS_sRow_flt(i) = Element(0); // Set to Zero when OOB so LDS could be issue without any preds. 
++ } ++ } ++ synchronize(); ++ } ++ ++ CUTLASS_DEVICE void ++ begin_loop(int epi_m, int epi_n) { ++ if (epi_m == 0) { // Assumes M-major subtile loop ++ if (!params.row_broadcast) return; // Do not issue LDS when row is scalar ++ Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n)); ++ Tensor tSR_rRow_flt = filter_zeros(tSR_rRow); ++ copy(tSR_sRow_flt, tSR_rRow_flt); ++ } ++ } ++ ++ template ++ CUTLASS_DEVICE Array ++ visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { ++ Array frg_row; ++ ++ CUTLASS_PRAGMA_UNROLL ++ for (int i = 0; i < FragmentSize; ++i) { ++ frg_row[i] = tSR_rRow(epi_v * FragmentSize + i); ++ } ++ ++ return frg_row; ++ } ++ }; ++ ++ template < ++ bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy ++ class... Args ++ > ++ CUTLASS_DEVICE auto ++ get_consumer_store_callbacks(ConsumerStoreArgs const& args) { ++ auto [M, N, K, L] = args.problem_shape_mnkl; ++ auto [m, n, k, l] = args.tile_coord_mnkl; ++ using ThreadCount = decltype(size(args.tiled_copy)); ++ ++ Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow); ++ Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n)); // (CTA_M, CTA_N) ++ Tensor sRow = make_tensor(make_smem_ptr(smem), ++ make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{})); // (CTA_M, CTA_N) ++ //// G2S: Gmem to Smem ++ auto tiled_g2s = make_tiled_copy(Copy_Atom{}, ++ Layout< Shape<_1, ThreadCount>, ++ Stride<_0, _1>>{}, ++ Layout<_1>{}); ++ auto thr_g2s = tiled_g2s.get_slice(args.thread_idx); ++ Tensor tGS_gRow = thr_g2s.partition_S(gRow); ++ Tensor tGS_sRow = thr_g2s.partition_D(sRow); ++ ++ //// G2S: Coord ++ auto cRow = make_identity_tensor(make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}))); ++ Tensor tGS_cRow = thr_g2s.partition_S(cRow); ++ ++ //// S2R: Smem to Reg ++ Tensor tSR_sRow = sm90_partition_for_epilogue(sRow, args.epi_tile, args.tiled_copy, args.thread_idx); ++ Tensor tSR_rRow = make_tensor_like(take<0,3>(tSR_sRow)); // (CPY,CPY_M,CPY_N) ++ ++ return ConsumerStoreCallbacks( ++ tGS_gRow, ++ tGS_sRow, ++ tGS_cRow, tiled_g2s, ++ tSR_sRow, ++ tSR_rRow, ++ args.tCcD, ++ args.residue_cD, ++ ThreadCount{}, ++ params); ++ } ++}; ++ ++///////////////////////////////////////////////////////////////////////////////////////////////// ++ ++// Column vector broadcast ++template< ++ int Stages, ++ class CtaTileShapeMNK, ++ class Element, ++ class StrideMNL = Stride<_1,_0,_0>, ++ int Alignment = 128 / sizeof_bits_v ++> ++struct Sm90ColOrScalarBroadcast { ++ static_assert(Stages == 0, "Column broadcast doesn't support smem usage yet"); ++ static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); ++ static_assert( ++ (cute::is_same_v>) || // col vector broadcast, e.g. per-row alpha/bias ++ (cute::is_same_v>)); // batched col vector broadcast, e.g. batched per-row bias ++ ++ // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem ++ struct SharedStorage { }; ++ ++ // This struct has been modified to have a bool indicating that ptr_col is a ++ // scalar that must be broadcast, instead of containing a scalar that is ++ // valid if ptr_col is null. 
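  // The column case mirrors the row case above: with col_broadcast set,
  // ptr_col holds one device element per row of A (e.g. per-token scales of
  // shape (m, 1)); with it cleared, ptr_col points at a single device-resident
  // scalar that is filled into every row. In the vector path, rows at or
  // beyond m are masked by the predicate built in begin(), so no out-of-bounds
  // global loads are issued.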
++ struct Arguments { ++ Element const* ptr_col = nullptr; ++ bool col_broadcast = true; ++ StrideMNL dCol = {}; ++ }; ++ ++ using Params = Arguments; ++ ++ template ++ static constexpr Params ++ to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { ++ return args; ++ } ++ ++ template ++ static bool ++ can_implement(ProblemShape const& problem_shape, Arguments const& args) { ++ return true; ++ } ++ ++ template ++ static size_t ++ get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { ++ return 0; ++ } ++ ++ template ++ static cutlass::Status ++ initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, ++ CudaHostAdapter* cuda_adapter = nullptr) { ++ return cutlass::Status::kSuccess; ++ } ++ ++ CUTLASS_DEVICE bool ++ is_producer_load_needed() const { ++ return false; ++ } ++ ++ CUTLASS_DEVICE bool ++ is_C_load_needed() const { ++ return false; ++ } ++ ++ CUTLASS_DEVICE bool ++ is_zero() const { ++ return (!params.col_broadcast && *(params.ptr_col) == Element(0)); ++ } ++ ++ CUTLASS_HOST_DEVICE ++ Sm90ColOrScalarBroadcast() { } ++ ++ CUTLASS_HOST_DEVICE ++ Sm90ColOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage) ++ : params(params) { } ++ ++ Params params; ++ ++ template ++ CUTLASS_DEVICE auto ++ get_producer_load_callbacks(ProducerLoadArgs const& args) { ++ return EmptyProducerLoadCallbacks{}; ++ } ++ ++ template ++ struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { ++ CUTLASS_DEVICE ++ ConsumerStoreCallbacks( ++ GTensor&& tCgCol, ++ RTensor&& tCrCol, ++ CTensor&& tCcCol, ++ ProblemShape problem_shape, ++ Params const& params ++ ): ++ tCgCol(cute::forward(tCgCol)), ++ tCrCol(cute::forward(tCrCol)), ++ tCcCol(cute::forward(tCcCol)), ++ m(get<0>(problem_shape)), ++ params(params) {} ++ ++ GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) ++ RTensor tCrCol; ++ CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) ++ Params const& params; ++ int m; ++ ++ CUTLASS_DEVICE void ++ begin() { ++ Tensor pred = make_tensor(shape(tCgCol)); ++ CUTLASS_PRAGMA_UNROLL ++ for (int i = 0; i < size(pred); ++i) { ++ pred(i) = get<0>(tCcCol(i)) < m; ++ } ++ ++ if (!params.col_broadcast) { ++ fill(tCrCol, *(params.ptr_col)); ++ return; ++ } ++ ++ // Filter so we don't issue redundant copies over stride-0 modes ++ // (only works if 0-strides are in same location, which is by construction) ++ copy_if(pred, filter(tCgCol), filter(tCrCol)); ++ } ++ ++ template ++ CUTLASS_DEVICE Array ++ visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { ++ Array frg_col; ++ Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); ++ ++ CUTLASS_PRAGMA_UNROLL ++ for (int i = 0; i < FragmentSize; ++i) { ++ frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i); ++ } ++ ++ return frg_col; ++ } ++ ++ }; ++ ++ template < ++ bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy ++ class... 
Args ++ > ++ CUTLASS_DEVICE auto ++ get_consumer_store_callbacks(ConsumerStoreArgs const& args) { ++ ++ auto [M, N, K, L] = args.problem_shape_mnkl; ++ Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_shape(M,N,L), params.dCol); ++ Tensor tCgCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) ++ mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); ++ Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) ++ ++ // Generate an identity tensor matching the shape of the global tensor and ++ // partition the same way, this will be used to generate the predicate ++ // tensor for loading ++ Tensor cCol = make_identity_tensor(mCol.shape()); ++ Tensor tCcCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) ++ cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); ++ ++ return ConsumerStoreCallbacks( ++ cute::move(tCgCol), ++ cute::move(tCrCol), ++ cute::move(tCcCol), ++ args.problem_shape_mnkl, ++ params ++ ); ++ } ++}; ++ ++} +diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp +new file mode 100644 +index 0000000..ef413e6 +--- /dev/null ++++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp +@@ -0,0 +1,319 @@ ++#pragma once ++ ++#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp" ++ ++/* ++ This file defines custom epilogues for fusing channel scales, token scales, ++ bias, and activation zero-points onto a GEMM operation using the ++ CUTLASS 2.x API, for sm80 (Ampere) NVIDIA GPUs. ++ ++ Epilogues must contain a public type named EVTCompute of type Sm80EVT, ++ as well as a static prepare_args function that constructs an ++ EVTCompute::Arguments struct. ++*/ ++ ++namespace vllm::c2x { ++ ++using namespace cute; ++ ++/* ++ * This class provides the common load descriptors for the ++ * ScaledEpilogue[...] classes ++ */ ++template ++struct ScaledEpilogueBase { ++ protected: ++ using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; ++ ++ template ++ using ColOrScalarLoad = ++ cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< ++ OutputTileThreadMap, T, Stride, Int<0>, Int<0>>>; ++ ++ template ++ using RowOrScalarLoad = ++ cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< ++ OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; ++ ++ template ++ using ColLoad = cutlass::epilogue::threadblock::VisitorColBroadcast< ++ OutputTileThreadMap, T, Stride, Int<0>, Int<0>>>; ++ ++ template ++ using RowLoad = cutlass::epilogue::threadblock::VisitorRowBroadcast< ++ OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; ++ ++ template ++ using RowOrZeroLoad = ++ cutlass::epilogue::threadblock::VisitorRowOrZeroBroadcast< ++ OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; ++ ++ // This utility function constructs the arguments for the load descriptors ++ // from a tensor. It can handle both row and column, as well as row/column or ++ // scalar cases. 
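  // Concretely (shapes here are illustrative): for C = A(m, k) @ B(k, n), a
  // per-token scale of shape (m, 1) or a per-channel scale of shape (1, n) has
  // numel() != 1, so the descriptor receives {data_ptr, true} and broadcasts a
  // genuine vector, while a per-tensor scale of shape (1, 1) yields
  // {data_ptr, false} and the very same descriptor splats a single
  // device-resident scalar. Only the device pointer is ever passed to the
  // kernel.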
++ template ++ static auto args_from_tensor(torch::Tensor const& tensor) { ++ using Arguments = typename Descriptor::Arguments; ++ auto* data_ptr = static_cast(tensor.data_ptr()); ++ if constexpr (std::is_same_v> || ++ std::is_same_v>) { ++ return Arguments{data_ptr, tensor.numel() != 1}; ++ } else { ++ // it would technically work but no use case as data_ptr is never nullptr ++ static_assert(!std::is_same_v>); ++ return Arguments{data_ptr}; ++ } ++ } ++ ++ // This overload handles the case where there might not be a tensor, in which ++ // case a nullptr is passed and a constant (0) is used. ++ template ++ static auto args_from_tensor(std::optional const& tensor) { ++ static_assert(std::is_same_v>); ++ using Arguments = typename Descriptor::Arguments; ++ auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; ++ return Arguments{data_ptr}; ++ } ++}; ++ ++/* ++ This epilogue function defines a quantized GEMM operation similar to ++ torch._scaled_mm. ++ ++ A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or ++ per-row. B can be quantized per-tensor or per-column. ++ Any combination of per-tensor and per-row or column is supported. ++ A and B must have symmetric quantization (zero point == 0). ++ ++ So the GEMM operation is D = (a_scales * A) (b_scales * B), where the ++ scales are applied elementwise with numpy-style broadcasting. ++ ++ ScaleA and ScaleB define the epilogue functions that apply the scales for ++ the A and B operands respectively. These scales may be either per-tensor or ++ per row or column. ++*/ ++template ++struct ScaledEpilogue ++ : private ScaledEpilogueBase { ++ private: ++ using SUPER = ScaledEpilogueBase; ++ using Accum = typename SUPER::Accum; ++ using ScaleA = typename SUPER::template ColOrScalarLoad; ++ using ScaleB = typename SUPER::template RowOrScalarLoad; ++ ++ using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< ++ cutlass::multiplies, float, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTCompute0 = ++ cutlass::epilogue::threadblock::Sm80EVT; ++ ++ using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< ++ cutlass::multiplies, ElementD, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ public: ++ using EVTCompute = ++ cutlass::epilogue::threadblock::Sm80EVT; ++ using ArgumentType = typename EVTCompute::Arguments; ++ ++ static ArgumentType prepare_args(torch::Tensor const& a_scales, ++ torch::Tensor const& b_scales) { ++ auto a_args = SUPER::template args_from_tensor(a_scales); ++ auto b_args = SUPER::template args_from_tensor(b_scales); ++ ++ typename EVTCompute0::Arguments evt0_args{b_args}; ++ return ArgumentType{a_args, evt0_args}; ++ } ++}; ++ ++/* ++ * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. ++ * This bias can also be used in the per-tensor azp case, where the activation ++ * zero point (azp) is used to compute an azp correction term, ++ * which is folded into the bias. ++ * ++ * The bias tensor must be per-output channel. ++ * ScaleA and ScaleB can be per-tensor or per-token/per-channel. 
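 * Per output element the fused computation is
 * D = a_scales * (b_scales * Accum) + bias, realized as a single multiply-add
 * in the final visitor node.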
++ */ ++template ++struct ScaledEpilogueBias ++ : protected ScaledEpilogueBase { ++ protected: ++ using SUPER = ScaledEpilogueBase; ++ using Accum = typename SUPER::Accum; ++ using ScaleA = typename SUPER::template ColOrScalarLoad; ++ using ScaleB = typename SUPER::template RowOrScalarLoad; ++ using Bias = typename SUPER::template RowLoad; ++ using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< ++ cutlass::multiplies, float, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTCompute0 = ++ cutlass::epilogue::threadblock::Sm80EVT; ++ ++ using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< ++ cutlass::multiply_add, ElementD, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ public: ++ using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; ++ using ArgumentType = typename EVTCompute::Arguments; ++ static ArgumentType prepare_args(torch::Tensor const& a_scales, ++ torch::Tensor const& b_scales, ++ torch::Tensor const& bias) { ++ auto a_args = SUPER::template args_from_tensor(a_scales); ++ auto b_args = SUPER::template args_from_tensor(b_scales); ++ auto bias_args = SUPER::template args_from_tensor(bias); ++ ++ typename EVTCompute0::Arguments evt0_args{b_args}; ++ return ArgumentType{a_args, evt0_args, bias_args}; ++ } ++}; ++ ++/* ++ * This epilogue directly supports per-tensor azp in int32 form. ++ * As opposed to the per-token epilogue below, this epilogue only has an azp_adj ++ * term, which should already be multiplied with the scalar azp. ++ * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. ++ * ++ * This epilogue also supports bias, which remains per-channel. ++ */ ++template ++struct ScaledEpilogueBiasAzp ++ : protected ScaledEpilogueBase { ++ private: ++ using SUPER = ScaledEpilogueBase; ++ using Accum = typename SUPER::Accum; ++ using ScaleA = typename SUPER::template ColOrScalarLoad; ++ using ScaleB = typename SUPER::template RowOrScalarLoad; ++ using Bias = typename SUPER::template RowOrZeroLoad; ++ ++ // This is the full AZP term, azp * J @ B, shape (1,n) ++ using AzpWithAdj = typename SUPER::template RowLoad; ++ ++ // Compute float(accum - azp_adj), both operands are int32_t ++ using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute< ++ cutlass::minus, float, int32_t, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTComputeAzp = ++ cutlass::epilogue::threadblock::Sm80EVT; ++ ++ using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute< ++ cutlass::multiplies, float, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTComputeScaleB = ++ cutlass::epilogue::threadblock::Sm80EVT; ++ ++ using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute< ++ cutlass::multiply_add, ElementD, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ public: ++ using EVTCompute = ++ cutlass::epilogue::threadblock::Sm80EVT; ++ ++ using ArgumentType = typename EVTCompute::Arguments; ++ ++ static ArgumentType prepare_args(torch::Tensor const& a_scales, ++ torch::Tensor const& b_scales, ++ torch::Tensor const& azp_adj, ++ std::optional const& bias) { ++ auto a_args = SUPER::template args_from_tensor(a_scales); ++ auto b_args = SUPER::template args_from_tensor(b_scales); ++ auto bias_args = SUPER::template args_from_tensor(bias); ++ auto azp_adj_args = ++ SUPER::template args_from_tensor(azp_adj); ++ ++ typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; ++ typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; ++ 
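    // The argument tree built here mirrors the EVT type tree above: per output
    // element it evaluates
    //   D = a_scales * (b_scales * float(Accum - azp_adj)) + bias,
    // subtracting the precomputed azp * (J @ B) row vector from the raw int32
    // accumulator before either scale is applied.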
return ArgumentType{a_args, evt_scale_b_args, bias_args}; ++ } ++}; ++ ++/* ++ * This epilogue supports per-token azp by computing and applying ++ * the correction term using a rank-1 update. If the term were materialized, ++ * it would require O(m*n) space, and this way it only requires O(m+n) space. ++ * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero ++ * point for each row of A. ++ * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. ++ * ++ * This epilogue also supports bias, which remains per-channel. ++ */ ++template ++struct ScaledEpilogueBiasAzpToken ++ : protected ScaledEpilogueBase { ++ private: ++ using SUPER = ScaledEpilogueBase; ++ using Accum = typename SUPER::Accum; ++ using ScaleA = typename SUPER::template ColOrScalarLoad; ++ using ScaleB = typename SUPER::template RowOrScalarLoad; ++ using Bias = typename SUPER::template RowOrZeroLoad; ++ ++ // Per-token azp term, shape (m,1) ++ using Azp = typename SUPER::template ColLoad; ++ ++ // This is the AZP adjustment term, J @ B, shape (1,n) ++ using AzpAdj = typename SUPER::template RowLoad; ++ ++ // Compute azp * azp_adj ++ using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute< ++ cutlass::multiplies, int32_t, int32_t, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTComputeAzp = ++ cutlass::epilogue::threadblock::Sm80EVT; ++ ++ // Compute float(accum - azp*azp_adj), all operands are int32_t ++ using ComputeAcc = cutlass::epilogue::threadblock::VisitorCompute< ++ cutlass::minus, float, int32_t, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTComputeAcc = ++ cutlass::epilogue::threadblock::Sm80EVT; ++ ++ using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute< ++ cutlass::multiplies, float, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTComputeScaleB = ++ cutlass::epilogue::threadblock::Sm80EVT; ++ ++ using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute< ++ cutlass::multiply_add, ElementD, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ public: ++ using EVTCompute = ++ cutlass::epilogue::threadblock::Sm80EVT; ++ ++ using ArgumentType = typename EVTCompute::Arguments; ++ ++ static ArgumentType prepare_args(torch::Tensor const& a_scales, ++ torch::Tensor const& b_scales, ++ torch::Tensor const& azp_adj, ++ torch::Tensor const& azp, ++ std::optional const& bias) { ++ auto a_args = SUPER::template args_from_tensor(a_scales); ++ auto b_args = SUPER::template args_from_tensor(b_scales); ++ auto bias_args = SUPER::template args_from_tensor(bias); ++ auto azp_args = SUPER::template args_from_tensor(azp); ++ auto azp_adj_args = ++ SUPER::template args_from_tensor(azp_adj); ++ ++ typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; ++ typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; ++ typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; ++ return ArgumentType{a_args, evt_scale_b_args, bias_args}; ++ } ++}; ++ ++}; // namespace vllm::c2x +\ No newline at end of file +diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +new file mode 100644 +index 0000000..c590c66 +--- /dev/null ++++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +@@ -0,0 +1,317 @@ ++#pragma once ++ ++#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp" ++ ++/* ++ This file defines custom epilogues for fusing channel scales, token scales, ++ 
bias, and activation zero-points onto a GEMM operation using the ++ CUTLASS 3.x API, for NVIDIA GPUs with sm90a (Hopper) or later. ++ ++ Epilogues must contain a public type named EVTCompute of type Sm90EVT, ++ as well as a static prepare_args function that constructs an ++ EVTCompute::Arguments struct. ++*/ ++ ++namespace vllm::c3x { ++ ++using namespace cute; ++ ++/* ++ * This class provides the common load descriptors for the ++ * ScaledEpilogue[...] classes ++ */ ++template ++struct ScaledEpilogueBase { ++ protected: ++ using Accum = cutlass::epilogue::fusion::Sm90AccFetch; ++ ++ template ++ using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< ++ 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, ++ Stride, Int<0>, Int<0>>>; ++ ++ template ++ using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast< ++ 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, ++ Stride, Int<1>, Int<0>>>; ++ ++ // Don't want to support nullptr by default ++ template ++ using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast< ++ 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, ++ Stride, Int<0>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; ++ ++ // Don't want to support nullptr by default ++ template ++ using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast< ++ 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, ++ Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; ++ ++ // This utility function constructs the arguments for the load descriptors ++ // from a tensor. It can handle both row and column, as well as row/column or ++ // scalar cases. ++ template ++ static auto args_from_tensor(torch::Tensor const& tensor) { ++ using Arguments = typename Descriptor::Arguments; ++ auto* data_ptr = static_cast(tensor.data_ptr()); ++ if constexpr (std::is_same_v> || ++ std::is_same_v>) { ++ return Arguments{data_ptr, tensor.numel() != 1}; ++ } else { ++ static_assert(!std::is_same_v> && ++ !std::is_same_v>); ++ return Arguments{data_ptr}; ++ } ++ } ++ ++ // This overload handles the case where there might not be a tensor, in which ++ // case a nullptr is passed and a constant (0) is used. ++ template ++ static auto args_from_tensor(std::optional const& tensor) { ++ using Arguments = typename Descriptor::Arguments; ++ auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; ++ static_assert(std::is_same_v> || ++ std::is_same_v>); ++ return Arguments{data_ptr}; ++ } ++}; ++ ++/* ++ This epilogue function defines a quantized GEMM operation similar to ++ torch.scaled_mm_. ++ ++ A and B may be both either int8 or fp8_e4m3. A can be ++ quantized per-tensor or per-row. B can be quantized per-tensor or per-column. ++ Any combination of per-tensor and per-row or column is supported. ++ A and B must have symmetric quantization (zero point == 0). ++ ++ So the GEMM operation is D = (a_scales * A) (b_scales * B), where the ++ scales are applied elementwise with numpy-style broadcasting. ++ ++ ScaleA and ScaleB define the epilogue functions that apply the scales for ++ the A and B operands respectively. These scales may be either per-tensor or ++ per row or column. 
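  Because a_scales varies only along M and b_scales only along N,
  (a_scales * A) (b_scales * B) equals a_scales * (A B) * b_scales elementwise,
  which is why the scales can be applied in the epilogue after the
  low-precision GEMM has produced the raw accumulator.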
++*/ ++template ++struct ScaledEpilogue ++ : private ScaledEpilogueBase { ++ private: ++ using SUPER = ScaledEpilogueBase; ++ using Accum = typename SUPER::Accum; ++ using ScaleA = typename SUPER::template ColOrScalarLoad; ++ using ScaleB = typename SUPER::template RowOrScalarLoad; ++ ++ using Compute0 = cutlass::epilogue::fusion::Sm90Compute< ++ cutlass::multiplies, float, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTCompute0 = ++ cutlass::epilogue::fusion::Sm90EVT; ++ ++ using Compute1 = cutlass::epilogue::fusion::Sm90Compute< ++ cutlass::multiplies, ElementD, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ public: ++ using EVTCompute = ++ cutlass::epilogue::fusion::Sm90EVT; ++ using ArgumentType = typename EVTCompute::Arguments; ++ ++ static ArgumentType prepare_args(torch::Tensor const& a_scales, ++ torch::Tensor const& b_scales) { ++ auto a_args = SUPER::template args_from_tensor(a_scales); ++ auto b_args = SUPER::template args_from_tensor(b_scales); ++ ++ typename EVTCompute0::Arguments evt0_args{b_args}; ++ return ArgumentType{a_args, evt0_args}; ++ } ++}; ++ ++/* ++ * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. ++ * This bias can also be used in the per-tensor azp case, where the activation ++ * zero point (azp) is used to compute an azp correction term, ++ * which is folded into the bias. ++ * ++ * The bias tensor must be per-output channel. ++ * ScaleA and ScaleB can be per-tensor or per-token/per-channel. ++ */ ++template ++struct ScaledEpilogueBias ++ : private ScaledEpilogueBase { ++ private: ++ using SUPER = ScaledEpilogueBase; ++ using Accum = typename SUPER::Accum; ++ using ScaleA = typename SUPER::template ColOrScalarLoad; ++ using ScaleB = typename SUPER::template RowOrScalarLoad; ++ using Bias = typename SUPER::template RowLoad; ++ ++ using Compute0 = cutlass::epilogue::fusion::Sm90Compute< ++ cutlass::multiplies, float, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTCompute0 = ++ cutlass::epilogue::fusion::Sm90EVT; ++ ++ using Compute1 = cutlass::epilogue::fusion::Sm90Compute< ++ cutlass::multiply_add, ElementD, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ public: ++ using EVTCompute = ++ cutlass::epilogue::fusion::Sm90EVT; ++ ++ using ArgumentType = typename EVTCompute::Arguments; ++ static ArgumentType prepare_args(torch::Tensor const& a_scales, ++ torch::Tensor const& b_scales, ++ torch::Tensor const& bias) { ++ auto a_args = SUPER::template args_from_tensor(a_scales); ++ auto b_args = SUPER::template args_from_tensor(b_scales); ++ auto bias_args = SUPER::template args_from_tensor(bias); ++ ++ typename EVTCompute0::Arguments evt0_args{b_args}; ++ return ArgumentType{a_args, evt0_args, bias_args}; ++ } ++}; ++ ++/* ++ * This epilogue directly supports per-tensor azp in int32 form. ++ * As opposed to the per-token epilogue below, this epilogue only has an azp_adj ++ * term, which should already be multiplied with the scalar azp. ++ * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. ++ * ++ * This epilogue also supports bias, which remains per-channel. 
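 * With J a vector of ones, an asymmetrically quantized A satisfies
 * A_q @ B = (A / a_scale) @ B + azp * (J @ B), so subtracting the precomputed
 * azp_adj = azp * J @ B from the int32 accumulator removes the zero-point
 * contribution of A before the scales and bias are applied.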
++ */ ++template ++struct ScaledEpilogueBiasAzp ++ : private ScaledEpilogueBase { ++ private: ++ using SUPER = ScaledEpilogueBase; ++ using Accum = typename SUPER::Accum; ++ using ScaleA = typename SUPER::template ColOrScalarLoad; ++ using ScaleB = typename SUPER::template RowOrScalarLoad; ++ using Bias = typename SUPER::template RowLoad; ++ ++ // This is the full AZP term, azp * J @ B, shape (1,n) ++ using AzpWithAdj = typename SUPER::template RowLoad; ++ ++ // Compute float(accum - azp_adj), both operands are int32_t ++ using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< ++ cutlass::minus, float, int32_t, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTComputeAzp = ++ cutlass::epilogue::fusion::Sm90EVT; ++ ++ using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< ++ cutlass::multiplies, float, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTComputeScaleB = ++ cutlass::epilogue::fusion::Sm90EVT; ++ ++ using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< ++ cutlass::multiply_add, ElementD, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ public: ++ using EVTCompute = ++ cutlass::epilogue::fusion::Sm90EVT; ++ using ArgumentType = typename EVTCompute::Arguments; ++ ++ static ArgumentType prepare_args(torch::Tensor const& a_scales, ++ torch::Tensor const& b_scales, ++ torch::Tensor const& azp_adj, ++ std::optional const& bias) { ++ auto a_args = SUPER::template args_from_tensor(a_scales); ++ auto b_args = SUPER::template args_from_tensor(b_scales); ++ auto bias_args = SUPER::template args_from_tensor(bias); ++ auto azp_adj_args = ++ SUPER::template args_from_tensor(azp_adj); ++ ++ typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; ++ typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; ++ return ArgumentType{a_args, evt_scale_b_args, bias_args}; ++ } ++}; ++ ++/* ++ * This epilogue supports per-token azp by computing and applying ++ * the correction term using a rank-1 update. If the term were materialized, ++ * it would require O(m*n) space, and this way it only requires O(m+n) space. ++ * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero ++ * point for each row of A. ++ * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. ++ * ++ * This epilogue also supports bias, which remains per-channel. 
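 * Per output element (i, j) the correction is azp[i] * azp_adj[j], i.e. the
 * rank-1 product evaluated on the fly, and the fused result is
 * D = a_scales * (b_scales * (Accum - azp * azp_adj)) + bias.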
++ */ ++template ++struct ScaledEpilogueBiasAzpToken ++ : private ScaledEpilogueBase { ++ private: ++ using SUPER = ScaledEpilogueBase; ++ using Accum = typename SUPER::Accum; ++ using ScaleA = typename SUPER::template ColOrScalarLoad; ++ using ScaleB = typename SUPER::template RowOrScalarLoad; ++ using Bias = typename SUPER::template RowLoad; ++ ++ // Per-token azp term, shape (m,1) ++ using Azp = typename SUPER::template ColLoad; ++ ++ // This is the AZP adjustment term, J @ B, shape (1,n) ++ using AzpAdj = typename SUPER::template RowLoad; ++ ++ // Compute azp * azp_adj ++ using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< ++ cutlass::multiplies, int32_t, int32_t, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTComputeAzp = ++ cutlass::epilogue::fusion::Sm90EVT; ++ ++ // Compute float(accum - azp*azp_adj), all operands are int32_t ++ using ComputeAcc = cutlass::epilogue::fusion::Sm90Compute< ++ cutlass::minus, float, int32_t, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTComputeAcc = ++ cutlass::epilogue::fusion::Sm90EVT; ++ ++ using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< ++ cutlass::multiplies, float, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ using EVTComputeScaleB = ++ cutlass::epilogue::fusion::Sm90EVT; ++ ++ using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< ++ cutlass::multiply_add, ElementD, float, ++ cutlass::FloatRoundStyle::round_to_nearest>; ++ ++ public: ++ using EVTCompute = ++ cutlass::epilogue::fusion::Sm90EVT; ++ using ArgumentType = typename EVTCompute::Arguments; ++ ++ static ArgumentType prepare_args(torch::Tensor const& a_scales, ++ torch::Tensor const& b_scales, ++ torch::Tensor const& azp_adj, ++ torch::Tensor const& azp, ++ std::optional const& bias) { ++ auto a_args = SUPER::template args_from_tensor(a_scales); ++ auto b_args = SUPER::template args_from_tensor(b_scales); ++ auto bias_args = SUPER::template args_from_tensor(bias); ++ auto azp_args = SUPER::template args_from_tensor(azp); ++ auto azp_adj_args = ++ SUPER::template args_from_tensor(azp_adj); ++ ++ typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; ++ typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; ++ typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; ++ return ArgumentType{a_args, evt_scale_b_args, bias_args}; ++ } ++}; ++ ++}; // namespace vllm::c3x +\ No newline at end of file +diff --git a/csrc/cutlass_extensions/torch_utils.hpp b/csrc/cutlass_extensions/torch_utils.hpp +new file mode 100644 +index 0000000..a1ff933 +--- /dev/null ++++ b/csrc/cutlass_extensions/torch_utils.hpp +@@ -0,0 +1,160 @@ ++#pragma once ++ ++#include ++ ++#include "cute/layout.hpp" ++#include "cutlass/layout/matrix.h" ++#include "cutlass/bfloat16.h" ++#include "cutlass/half.h" ++ ++using ColumnMajor = typename cutlass::layout::ColumnMajor; ++using RowMajor = typename cutlass::layout::RowMajor; ++ ++namespace cute { ++ ++namespace detail { ++ ++template ++CUTE_HOST_DEVICE constexpr auto tapply_with_idx(T&& t, F&& f, G&& g, ++ seq) { ++ return g(f(cute::get(static_cast(t)), I)...); ++} ++ ++template ++CUTE_HOST_DEVICE constexpr auto make_shape_from_idx(F&& f, seq) { ++ return make_shape(f(I)...); ++} ++ ++}; // namespace detail ++ ++template ++CUTE_HOST_DEVICE constexpr auto transform_with_idx(T const& t, F&& f) { ++ if constexpr (cute::is_tuple::value) { ++ return detail::tapply_with_idx( ++ t, f, [](auto const&... 
a) { return cute::make_tuple(a...); }, ++ tuple_seq{}); ++ } else { ++ return f(t); ++ } ++ ++ CUTE_GCC_UNREACHABLE; ++} ++ ++// calls: make_shape(f(0), f(1), ..., f(N-1)) ++template ++CUTE_HOST_DEVICE constexpr auto make_shape_from_idx(F&& f) { ++ return detail::make_shape_from_idx(f, make_seq{}); ++} ++ ++}; // namespace cute ++ ++// Make a layout from a tensor with `rank(Stride{})`, where the shape is the ++// shape of the passed in tensor and the strides are of type `Stride` and ++// contain the strides of the passed in tensor, checking that any static strides ++// in `Stride{}` match the strides of the passed in tensor. ++// If `tensor.dim() < rank(Stride{})`, the shape is padded with 1s and the extra ++// strides are set to be 0 or 1. ++template ++static inline auto make_cute_layout(torch::Tensor const& tensor, ++ std::string_view name = "tensor") { ++ TORCH_CHECK(tensor.dim() <= rank(Stride{})); ++ auto stride = cute::transform_with_idx( ++ Stride{}, [&](auto const& stride_ele, auto const& idx) { ++ using StrideEle = std::decay_t; ++ ++ if (idx < tensor.dim()) { ++ if constexpr (cute::is_static_v) { ++ TORCH_CHECK(StrideEle::value == tensor.stride(idx), "Expected ", ++ name, ".stride(", idx, ") to be ", StrideEle::value); ++ return StrideEle{}; ++ } else { ++ if (tensor.size(idx) == 1) { ++ // use 0 stride for dim with size 1, this is easier for ++ // cute/cutlass to optimize (helps the TMA code flatten dims) ++ return StrideEle{0}; ++ } else { ++ return tensor.stride(idx); ++ } ++ } ++ } else { ++ // Extra strides are assumed to be 0 or 1 ++ if constexpr (cute::is_static_v) { ++ static_assert(StrideEle::value == 0 || StrideEle::value == 1); ++ } ++ return StrideEle{}; ++ } ++ }); ++ ++ auto shape = cute::make_shape_from_idx([&](auto const& idx) { ++ if (idx < tensor.dim()) ++ return tensor.size(idx); ++ else ++ return int64_t(1); ++ }); ++ ++ return make_layout(shape, stride); ++} ++ ++template ++static inline auto maybe_make_cute_layout( ++ std::optional const& tensor, ++ std::string_view name = "tensor") { ++ using Layout = decltype(make_cute_layout(*tensor)); ++ ++ if (tensor) { ++ return std::optional{make_cute_layout(*tensor, name)}; ++ } else { ++ return std::optional{}; ++ } ++} ++ ++// ++// Torch Type to Cutlass Type (equivalent_cutlass_type) ++// ++ ++template ++struct equivalent_cutlass_type { ++ using type = T; ++}; ++ ++template ++using equivalent_cutlass_type_t = typename equivalent_cutlass_type::type; ++ ++template <> ++struct equivalent_cutlass_type { ++ using type = cutlass::half_t; ++}; ++ ++template <> ++struct equivalent_cutlass_type { ++ using type = cutlass::bfloat16_t; ++}; ++ ++// ++// equivalent_scalar_t (basically inverse of equivalent_cutlass_type) ++// ++ ++// Return a `c10::CppTypeToScalarType` compatible type, i.e. 
get the C++ from ++// c10 that is equivalent to T, e.g.: `cutlass::half_t -> c10::Half` ++template ++struct equivalent_scalar_type { ++ using type = T; ++}; ++ ++template ++using equivalent_scalar_type_t = typename equivalent_scalar_type::type; ++ ++template <> ++struct equivalent_scalar_type { ++ using type = c10::Half; ++}; ++ ++template <> ++struct equivalent_scalar_type { ++ using type = c10::BFloat16; ++}; ++ ++// get equivalent c10::ScalarType tag from compile time type ++template ++static inline constexpr c10::ScalarType equivalent_scalar_type_v = ++ c10::CppTypeToScalarType>::value; +\ No newline at end of file +diff --git a/csrc/cutlass_extensions/vllm_collective_builder.cuh b/csrc/cutlass_extensions/vllm_collective_builder.cuh +new file mode 100644 +index 0000000..085ee12 +--- /dev/null ++++ b/csrc/cutlass_extensions/vllm_collective_builder.cuh +@@ -0,0 +1,43 @@ ++#pragma once ++ ++#include "cutlass/gemm/collective/collective_builder.hpp" ++ ++namespace cutlass::gemm::collective { ++using namespace cute; ++ ++// ++// VLLMCollectiveBuilder is a wrapper around CollectiveBuilder that allows for ++// for custom kernel tags, allowing you to build custom collectives. Without ++// touching the cutlass library headers, using `CutlassKernelTag` will mean it ++// will resort to using the standard cutlass collective builder. ++// ++ ++// Use the default Cutlass collective builder, i.e. use an unmodified cutless ++// collective ++struct CutlassKernelTag {}; ++ ++template ++struct VLLMCollectiveBuilder { ++ static_assert(sizeof(ElementA) == 0, ++ "Could not build a collective for given parameters."); ++}; ++ ++template ++struct VLLMCollectiveBuilder< ++ CutlassKernelTag, ArchTag, OpClass, ElementA, GmemLayoutA, AlignmentA, ++ ElementB, GmemLayoutB, AlignmentB, ElementAccumulator, TileShape_MNK, ++ ClusterShape_MNK, StageCountType, KernelScheduleType> { ++ using CollectiveOp = typename CollectiveBuilder< ++ ArchTag, OpClass, ElementA, GmemLayoutA, AlignmentA, ElementB, ++ GmemLayoutB, AlignmentB, ElementAccumulator, TileShape_MNK, ++ ClusterShape_MNK, StageCountType, KernelScheduleType>::CollectiveOp; ++}; ++ ++}; // namespace cutlass::gemm::collective +\ No newline at end of file +diff --git a/csrc/cutlass_extensions/vllm_custom_types.cuh b/csrc/cutlass_extensions/vllm_custom_types.cuh +new file mode 100644 +index 0000000..6146bdc +--- /dev/null ++++ b/csrc/cutlass_extensions/vllm_custom_types.cuh +@@ -0,0 +1,50 @@ ++#pragma once ++ ++#include "cutlass/integer_subbyte.h" ++ ++namespace cutlass { ++ ++/////////////////////////////////////////////////////////////////////////////////////////////////// ++ ++template ++struct vllm_biased_integer_subbyte : public integer_subbyte { ++ using Base = integer_subbyte; ++ ++ using Storage = typename Base::Storage; ++ using xint_t = typename Base::xint_t; ++ ++ using Base::bits_mask_; ++ using Base::sign_mask_; ++ using Base::storage; ++ ++ // ++ // Methods ++ // ++ ++ /// No operation ++ vllm_biased_integer_subbyte() = default; ++ ++ /// Conversion from integer type ++ CUTLASS_HOST_DEVICE explicit vllm_biased_integer_subbyte(int value) ++ : Base(value) {} ++ CUTLASS_HOST_DEVICE explicit vllm_biased_integer_subbyte(unsigned value) ++ : Base(value) {} ++ CUTLASS_HOST_DEVICE explicit vllm_biased_integer_subbyte(double value) ++ : Base(value) {} ++}; ++/////////////////////////////////////////////////////////////////////////////////////////////////// ++ ++// "GPTQ" types, i.e. 
symmetric quantization ++using vllm_uint4b8_t = vllm_biased_integer_subbyte<4, 8>; // u4b8 ++using vllm_uint8b128_t = vllm_biased_integer_subbyte<8, 128>; // u8b128 ++ ++/////////////////////////////////////////////////////////////////////////////////////////////////// ++ ++template ++struct sizeof_bits> { ++ static constexpr int value = Bits; ++}; ++ ++/////////////////////////////////////////////////////////////////////////////////////////////////// ++ ++} // namespace cutlass +diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +new file mode 100644 +index 0000000..b401736 +--- /dev/null ++++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +@@ -0,0 +1,78 @@ ++import enum ++from typing import Dict, Union ++ ++from cutlass_library import * ++ ++# ++# Extend cutlass library with custom types, and missing values ++# ++ ++ ++class VLLMDataType(enum.Enum): ++ u4b8 = enum_auto() ++ u8b128 = enum_auto() ++ ++ ++class MixedInputKernelScheduleType(enum.Enum): ++ TmaWarpSpecialized = enum_auto() ++ TmaWarpSpecializedPingpong = enum_auto() ++ TmaWarpSpecializedCooperative = enum_auto() ++ ++ ++VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = { ++ **DataTypeNames, # type: ignore ++ **{ ++ VLLMDataType.u4b8: "u4b8", ++ VLLMDataType.u8b128: "u8b128", ++ } ++} ++ ++VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { ++ **DataTypeTag, # type: ignore ++ **{ ++ VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", ++ VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t", ++ } ++} ++ ++VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = { ++ **DataTypeSize, # type: ignore ++ **{ ++ VLLMDataType.u4b8: 4, ++ VLLMDataType.u8b128: 8, ++ } ++} ++ ++VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = { ++ VLLMDataType.u4b8: "vllm::kU4B8", ++ VLLMDataType.u8b128: "vllm::kU8B128", ++ DataType.u4: "vllm::kU4", ++ DataType.u8: "vllm::kU8", ++ DataType.s4: "vllm::kS4", ++ DataType.s8: "vllm::kS8", ++ DataType.f16: "vllm::kFloat16", ++ DataType.bf16: "vllm::kBfloat16", ++} ++ ++VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { ++ DataType.u8: "at::ScalarType::Byte", ++ DataType.s8: "at::ScalarType::Char", ++ DataType.e4m3: "at::ScalarType::Float8_e4m3fn", ++ DataType.s32: "at::ScalarType::Int", ++ DataType.f16: "at::ScalarType::Half", ++ DataType.bf16: "at::ScalarType::BFloat16", ++ DataType.f32: "at::ScalarType::Float", ++} ++ ++VLLMKernelScheduleTag: Dict[Union[ ++ MixedInputKernelScheduleType, KernelScheduleType], str] = { ++ **KernelScheduleTag, # type: ignore ++ **{ ++ MixedInputKernelScheduleType.TmaWarpSpecialized: ++ "cutlass::gemm::KernelTmaWarpSpecialized", ++ MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: ++ "cutlass::gemm::KernelTmaWarpSpecializedPingpong", ++ MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: ++ "cutlass::gemm::KernelTmaWarpSpecializedCooperative", ++ } ++ } +diff --git a/csrc/cutlass_extensions/vllm_numeric_conversion.cuh b/csrc/cutlass_extensions/vllm_numeric_conversion.cuh +new file mode 100644 +index 0000000..90f226c +--- /dev/null ++++ b/csrc/cutlass_extensions/vllm_numeric_conversion.cuh +@@ -0,0 +1,992 @@ ++#pragma once ++ ++#include "cutlass/numeric_conversion.h" ++#include "cutlass_extensions/vllm_custom_types.cuh" ++#include "cutlass_extensions/cute_utils.cuh" ++#include "cutlass_extensions/vllm_type_utils.cuh" ++ ++// this file extends: ++// 
https://github.com/NVIDIA/cutlass/blob/cutlass-3.5.0/include/cutlass/numeric_conversion.h ++// with vllm specific type conversions, namely: vllm_uint4b8_t, vllm_uint8b128_t ++// as well as adds interleaved numeric array converters for specific types. ++// (interleaved numeric array converters can be more efficient for subbyte ++// types) ++ ++namespace cutlass { ++ ++// InterleavedNumericArrayConverter is like NumericArrayConverter but also ++// deinterleaves converted elements based on IlvBlkLayout, interleaving can ++// make subbyte converts more efficient by allowing for efficient extraction ++// of subbyte elements from a 32bit register. ++template ++struct InterleavedNumericArrayConverter { ++ using Converter = NumericArrayConverter; ++ ++ using result_type = typename Converter::result_type; ++ using source_type = typename Converter::source_type; ++ ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ if (cute::elect_one_sync()) { ++ if constexpr (std::is_same_v) { ++ printf( ++ "Convert %s <= %s (N = %d, IlvBlkLayout = void), not implemented\n", ++ nameof_v, nameof_v, N); ++ } else { ++ printf( ++ "Convert %s <= %s (N = %d, size(IlvBlkLayout{}) = %d), not " ++ "implemented\n", ++ nameof_v, nameof_v, N, size(IlvBlkLayout{})); ++ } ++ __brkpt(); ++ } ++ return {}; ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++template ++struct InterleavedNumericArrayConverter< ++ IlvBlkLayout, T, S, N, Round, ++ std::enable_if_t()>> { ++ using Converter = NumericArrayConverter; ++ ++ using result_type = typename Converter::result_type; ++ using source_type = typename Converter::source_type; ++ ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ return Converter::convert(source); ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++template ++struct ArrayConverterPacked32Bit { ++ using result_type = Array; ++ using source_type = Array; ++ ++ using result_packed_8_t = Array; ++ using result_packed_4_t = Array; ++ using result_packed_2_t = Array; ++ using src_packed_8_t = Array; ++ using src_packed_4_t = Array; ++ using src_packed_2_t = Array; ++ ++ static_assert(N % 2 == 0, "N must be a multiple of 2"); ++ static_assert(cutlass::sizeof_bits_v >= 4); // TODO: add 16 packed sources ++ static_assert(32 % cutlass::sizeof_bits_v == 0); ++ static constexpr auto src_elems_per_32bit_reg = ++ 32 / cutlass::sizeof_bits_v; ++ ++ // Maybe not Valid. ScalarConverter will not actually work unless ++ // NumericConverter is implemented. However it won't be used ++ // anyways since we assert N % 2 == 0, just here for compliance with ++ // VectorizedConverter. ++ using ScalarConverter = NumericConverter; ++ ++ template ++ CUTLASS_DEVICE static auto to_regs(PackedSrc const& src) { ++ if constexpr (sizeof(PackedSrc) == 1) { ++ return Array{reinterpret_cast(src)}; ++ } else if constexpr (sizeof(PackedSrc) == 2) { ++ return Array{reinterpret_cast(src)}; ++ } else if constexpr (sizeof(PackedSrc) == 4) { ++ return Array{reinterpret_cast(src)}; ++ } else { ++ static_assert(sizeof(PackedSrc) == 8); ++ return reinterpret_cast const&>(src); ++ } ++ } ++ ++ // The core converter uses bit tricks to construct a known FP16 number, then ++ // does a subtraction in FP16 for the final result. 
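  // Illustrative only (a hypothetical helper, not used by the converters
  // below): the trick exploits the binary16 layout, where OR-ing a 4-bit
  // value v into the low mantissa bits of 0x6400 (the encoding of 1024.0 in
  // binary16) produces the encoding of (1024 + v). One fp16 subtraction of
  // 1024 plus the storage bias (1032 for the bias-8 u4b8 type) then yields
  // the signed result v - 8; the exact constants used by the device code may
  // differ.
  CUTLASS_HOST_DEVICE
  static constexpr uint32_t fp16_bits_of_1024_plus(uint32_t v4) {
    return 0x6400u | (v4 & 0xFu);  // binary16 bit pattern of (1024 + v4)
  }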
++ template ++ CUTLASS_DEVICE static PackedResultType packed_convert( ++ PackedSrcType const& source) { ++ static_assert(PackedSrcType::kElements == PackedResultType::kElements); ++ static_assert(PackedResultType::kElements == 2 || ++ PackedResultType::kElements == 4 || ++ PackedResultType::kElements == 8, ++ "Invalid PackedResultType must be 2, 4 or 8."); ++ static_assert(std::is_same_v); ++ static_assert(std::is_same_v); ++ ++ return RegConvert32bit::template convert(to_regs(source)); ++ } ++ ++ friend class detail::VectorizedConverter; ++ ++ public: ++ CUTLASS_DEVICE static result_type convert(source_type const& source) { ++ result_type result; ++ using ConverterType = ++ ArrayConverterPacked32Bit; ++ ++ if constexpr (src_elems_per_32bit_reg >= 8) { ++ detail::VectorizedConverter::convert< ++ ConverterType, result_packed_8_t, src_packed_8_t, result_packed_4_t, ++ src_packed_4_t, result_packed_2_t, src_packed_2_t>(result, source); ++ } else if constexpr (src_elems_per_32bit_reg >= 4) { ++ detail::VectorizedConverter::convert(result, source); ++ } else { ++ detail::VectorizedConverter::convert(result, source); ++ } ++ ++ return result; ++ } ++}; ++ ++// Convert 8 4bit values packed into a 32bit register to 8 8bit values packed ++// into 2 32bit register. ++template ++CUTLASS_DEVICE cutlass::AlignedArray lut_4bit_to_8bit_convert( ++ uint32_t src) { ++ cutlass::AlignedArray r; ++ // Determines if the value is in the top half of the LUT if set or ++ // (i.e. LUT[8:15]) in the bottom half (i.e. LUT[0:7]) if not set. Then move ++ // into bit position 0x4 of each nibble so when or'd with final_prmt_base it ++ // selects the correct candidate. When elements in final_prmt_base ++ // are >= 0x4, the high candidate is selected (i.e. LUT[8:15]), when elements ++ // are < 0x4, the low candidate is selected (i.e. LUT[0:7]) ++ uint32_t high_bit = (src & 0x88888888) >> 1; ++ ++ // `high_bit` is OR'd with 0x31203120 to find the correct value in the LUT ++ // (selects correct high or low candidate) ++ const uint32_t final_prmt_base = 0x32103210; ++ ++ // Ignore the high bit when indexing into LUT, for each 4bit value ++ // we index into both the high and low candidates then use ++ // high_bit | final_prmt_base to select the correct candidate ++ uint32_t lut_idx = (src & 0x77777777); ++ ++ auto pack = [](uint8_t a, uint8_t b, uint8_t c, uint8_t d) { ++ return uint32_t(a) | (uint32_t(b) << 8) | (uint32_t(c) << 16) | ++ (uint32_t(d) << 24); ++ }; ++ ++ static constexpr uint32_t LOW_0 = pack(LUT0, LUT1, LUT2, LUT3); ++ static constexpr uint32_t LOW_1 = pack(LUT4, LUT5, LUT6, LUT7); ++ static constexpr uint32_t HIGH_0 = pack(LUT8, LUT9, LUT10, LUT11); ++ static constexpr uint32_t HIGH_1 = pack(LUT12, LUT13, LUT14, LUT15); ++ ++ CUTLASS_PRAGMA_UNROLL ++ for (int ii = 0; ii < 2; ++ii, lut_idx >>= 16, high_bit >>= 16) { ++ uint32_t final_prmt_idx = final_prmt_base | high_bit; ++ ++ // This uses a look up table to convert packed int4s to packed int8s, ++ // using the int4 value as the index to prmt. It first select both the ++ // high and low candidates, then uses the high bit (i.e. `high_bit`) to ++ // select the correct candidate. 
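++    // Net effect per 4-bit element v (scalar sketch): out_byte == LUT[v],
++    // where LUT[0..7] are the bytes of {LOW_0, LOW_1} and LUT[8..15] are the
++    // bytes of {HIGH_0, HIGH_1}; `lut_idx` supplies (v & 0x7) and
++    // `final_prmt_idx` (i.e. 0x3210 | high_bit) picks the high or low half
++    // based on (v & 0x8).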
++ asm volatile( ++ "{\n" ++ " .reg .b32 low, high;\n" ++ " prmt.b32 low, %1, %2, %5;\n" ++ " prmt.b32 high, %3, %4, %5;\n" ++ " prmt.b32 %0, low, high, %6;\n" ++ "}\n" ++ : "=r"(r[ii]) ++ : "n"(LOW_0), "n"(LOW_1), "n"(HIGH_0), "n"(HIGH_1), "r"(lut_idx), ++ "r"(final_prmt_idx)); ++ } ++ ++ return r; ++}; ++ ++// for Array <= Array ++template ++struct NumericArrayConverter { ++ using result_type = Array; ++ using source_type = Array; ++ ++ static FloatRoundStyle const round_style = Round; ++ ++ private: ++ struct RegConvert { ++ template ++ CUTLASS_DEVICE static PackedResultType convert(Array src_) { ++ // [-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7] as int8s ++ auto r = lut_4bit_to_8bit_convert<0xF8, 0xF9, 0xFA, 0xFB, // ++ 0xFC, 0xFD, 0xFE, 0xFF, // ++ 0x00, 0x01, 0x02, 0x03, // ++ 0x04, 0x05, 0x06, 0x07>(src_[0]); ++ return reinterpret_cast(r); ++ }; ++ }; ++ ++ public: ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ return ArrayConverterPacked32Bit::convert(source); ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++// for Array <= Array ++template ++struct NumericArrayConverter { ++ using result_type = Array; ++ using source_type = Array; ++ ++ static FloatRoundStyle const round_style = Round; ++ ++ private: ++ struct RegConvert { ++ template ++ CUTLASS_DEVICE static PackedResultType convert(Array src_) { ++ // [-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7] as fp8s ++ auto r = lut_4bit_to_8bit_convert<0xD0, 0xCE, 0xCC, 0xCA, // ++ 0xC8, 0xC4, 0xC0, 0xB8, // ++ 0x00, 0x38, 0x40, 0x44, // ++ 0x48, 0x4A, 0x4C, 0x4E>(src_[0]); ++ return reinterpret_cast(r); ++ }; ++ }; ++ ++ public: ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ return ArrayConverterPacked32Bit::convert(source); ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++// for Array <= Array ++template ++struct NumericArrayConverter { ++ using result_type = Array; ++ using source_type = Array; ++ ++ struct RegConvert { ++ template ++ CUTLASS_DEVICE static PackedResultType convert(Array src_) { ++ uint32_t src = src_[0]; ++ using RegArray = ++ cutlass::AlignedArray; ++ RegArray r; ++ ++ // Below constructs the following temporary: ++ // fp16s_01 = {0x00, i4_01, 0x00, i4_01} ++ // fp16s_23 = {0x00, i4_23, 0x00, i4_23} ++ // fp16s_45 = {0x00, i4_45, 0x00, i4_45} ++ // fp16s_67 = {0x00, i4_67, 0x00, i4_67} ++ // We use inline asm instead of __byte_perm intrinsic since we don't want ++ // the documented (& 0x7) on the index. NVCC might be able to optimize it ++ // out since the index is a constexpr, but we choose to be safe about it ++ // here. ++ uint32_t prmt_indices[4] = {0x4040, 0x4141, 0x4242, 0x4343}; ++ static_assert(RegArray::kElements <= 4, ++ "Too many inputs for F16 -> I4 vector converter"); ++ CUTLASS_PRAGMA_UNROLL ++ for (int ii = 0; ii < RegArray::kElements; ++ii) { ++ asm volatile( ++ "{\n" ++ " prmt.b32 %0, %1, %2, %3;\n" ++ "}\n" ++ : "=r"(r[ii]) ++ : "r"(src), "n"(0), "r"(prmt_indices[ii])); ++ } ++ ++ // Since the stored 4bit values are biased by 8 we get stored_val = (x+8) ++ // we are trying to construct x and a fp16 value ++ // The below XOR does the following: ++ // 1) Sets the exponent bits of the FP16 to the correct value for the ++ // FP16 magic_num. 
We will be constructing {1024+16*(x1+8), 1024+(x0+8)}, ++ // where x1 in the high nibble and x0 is the low nibble then using hfma ++ // to subtract 1032 from that ++ // The AND does the following: ++ // 1) Clear the set bits for the int4 we will ignore. ++ // We use lop3 so that we can use 1 instruction for AND and XOR. ++ static constexpr uint32_t xor_mask = 0x64006400; ++ static constexpr uint32_t and_mask = 0xFFF0FF0F; ++ static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; ++ ++ // For each operand, computes: ++ // r[i] = (r[i] & and_mask) ^ xor_mask ++ CUTLASS_PRAGMA_UNROLL ++ for (int ii = 0; ii < RegArray::kElements; ++ii) { ++ asm volatile( ++ "{\n" ++ " lop3.b32 %0, %0, %1, %2, %3;\n" ++ "}\n" ++ : "+r"(r[ii]) ++ : "n"(and_mask), "n"(xor_mask), "n"(immLut)); ++ } ++ ++ // We will issue 2 hfmas that do the following: ++ // {x1, x0} = {1024+16*(x1+8), 1024+(x0+8)} * {1/16, 1} - {72, 1032} ++ // = {x1 + 1152, x0 + 1032} * {1/16, 1} - {72, 1032} ++ static constexpr uint32_t hfma_bias_rep = 0xD480E408; // {72, 1032} ++ static constexpr uint32_t hfma_scale_rep = 0x2C003C00; // {1 / 16, 1} ++ ++ const half2& hfma_bias = reinterpret_cast(hfma_bias_rep); ++ const half2& hfma_scale = reinterpret_cast(hfma_scale_rep); ++ CUTLASS_PRAGMA_UNROLL ++ for (int ii = 0; ii < RegArray::kElements; ++ii) { ++ half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]); ++ fp16x2_val = __hfma2(hfma_scale, fp16x2_val, hfma_bias); ++ } ++ ++ return reinterpret_cast(r); ++ }; ++ }; ++ ++ public: ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ return ArrayConverterPacked32Bit::convert(source); ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++// for Array <= Array ++// for IlvdLayout: (2, 4):(4, 1) ++template ++struct InterleavedNumericArrayConverter, Stride<_4, _1>>, ++ cutlass::half_t, vllm_uint4b8_t, N, ++ Round, void> { ++ using IlvdLayout = Layout, Stride<_4, _1>>; ++ static_assert(N % size(IlvdLayout{}) == 0); ++ ++ using result_type = Array; ++ using source_type = Array; ++ ++ static FloatRoundStyle const round_style = Round; ++ ++ private: ++ struct RegConvert { ++ template ++ CUTLASS_DEVICE static PackedResultType convert(Array src_) { ++ uint32_t src = src_[0]; ++ using RegArray = ++ cutlass::AlignedArray; ++ RegArray r; ++ ++ static_assert(PackedResultType::kElements <= size(IlvdLayout{})); ++ static constexpr uint32_t xor_mask = 0x64006400; ++ ++ for (int ii = 0; ii < RegArray::kElements; ii += 2) { ++ auto src_ = src >> (4 * (ii)); ++ r[ii + 0] = src_; ++ r[ii + 1] = src_; ++ ++ static constexpr uint32_t and_xor_imm_lut = (0xf0 & 0xcc) ^ 0xaa; ++ ++ static constexpr uint32_t low_nib_mask = 0x000F000F; ++ static constexpr uint32_t high_nib_mask = 0x00F000F0; ++ ++ asm volatile( ++ "{\n" ++ " lop3.b32 %0, %0, %1, %2, %3;\n" ++ "}\n" ++ : "+r"(r[ii + 0]) ++ : "n"(low_nib_mask), "n"(xor_mask), "n"(and_xor_imm_lut)); ++ ++ asm volatile( ++ "{\n" ++ " lop3.b32 %0, %0, %1, %2, %3;\n" ++ "}\n" ++ : "+r"(r[ii + 1]) ++ : "n"(high_nib_mask), "n"(xor_mask), "n"(and_xor_imm_lut)); ++ ++ // For low nibble: ++ // {x1, x0} = {1024+(x1+8), 1024+(x0+8)} * {1, 1} - {1032, 1032} ++ // For high nibble: ++ // {x1, x0} = {1024+16*(x1+8), 1024+16*(x0+8)} * {1/16, 1/16} ++ // - {72, 72} ++ static constexpr uint32_t low_nib_bias = 0x64086408; // {1032, 1032} ++ static constexpr uint32_t high_nib_scale = 0x2C002C00; // {1/16, 1/16} ++ static constexpr uint32_t high_nib_bias = 0xD480D480; // {-72, -72} ++ ++ { ++ half2& fp16x2_val 
= reinterpret_cast<__half2&>(r[ii + 0]); ++ fp16x2_val = ++ __hsub2(fp16x2_val, reinterpret_cast(low_nib_bias)); ++ } ++ ++ { ++ half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 1]); ++ fp16x2_val = __hfma2(fp16x2_val, ++ reinterpret_cast(high_nib_scale), ++ reinterpret_cast(high_nib_bias)); ++ } ++ } ++ ++ return reinterpret_cast(r); ++ }; ++ }; ++ ++ public: ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ return ArrayConverterPacked32Bit::convert(source); ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++// for Array <= Array ++// for IlvdLayout: (2, 4):(4, 1) ++template ++struct InterleavedNumericArrayConverter, Stride<_4, _1>>, ++ cutlass::half_t, uint4_t, N, Round, ++ void> { ++ using IlvdLayout = Layout, Stride<_4, _1>>; ++ static_assert(N % size(IlvdLayout{}) == 0); ++ ++ using result_type = Array; ++ using source_type = Array; ++ ++ static FloatRoundStyle const round_style = Round; ++ ++ private: ++ struct RegConvert { ++ template ++ CUTLASS_DEVICE static PackedResultType convert(Array src_) { ++ uint32_t src = src_[0]; ++ using RegArray = ++ cutlass::AlignedArray; ++ RegArray r; ++ ++ static_assert(PackedResultType::kElements <= size(IlvdLayout{})); ++ static constexpr uint32_t xor_mask = 0x64006400; ++ ++ for (int ii = 0; ii < RegArray::kElements; ii += 2) { ++ auto src_ = src >> (4 * (ii)); ++ r[ii + 0] = src_; ++ r[ii + 1] = src_; ++ ++ static constexpr uint32_t and_xor_imm_lut = (0xf0 & 0xcc) ^ 0xaa; ++ ++ static constexpr uint32_t low_nib_mask = 0x000F000F; ++ static constexpr uint32_t high_nib_mask = 0x00F000F0; ++ ++ asm volatile( ++ "{\n" ++ " lop3.b32 %0, %0, %1, %2, %3;\n" ++ "}\n" ++ : "+r"(r[ii + 0]) ++ : "n"(low_nib_mask), "n"(xor_mask), "n"(and_xor_imm_lut)); ++ ++ asm volatile( ++ "{\n" ++ " lop3.b32 %0, %0, %1, %2, %3;\n" ++ "}\n" ++ : "+r"(r[ii + 1]) ++ : "n"(high_nib_mask), "n"(xor_mask), "n"(and_xor_imm_lut)); ++ ++ // For low nibble: ++ // {x1, x0} = {1024+x1, 1024+x0} - {1024, 1024} ++ // For high nibble: ++ // {x1, x0} = {1024+16*x1, 1024+16*x0} * {1/16, 1/16} - {64, 64} ++ static constexpr uint32_t low_nib_bias = 0x64006400; // {1024, 1024} ++ static constexpr uint32_t high_nib_scale = 0x2C002C00; // {1/16, 1/16} ++ static constexpr uint32_t high_nib_bias = 0xD400D400; // {-64, -64} ++ ++ { ++ half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 0]); ++ fp16x2_val = ++ __hsub2(fp16x2_val, reinterpret_cast(low_nib_bias)); ++ } ++ ++ { ++ half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 1]); ++ fp16x2_val = __hfma2(fp16x2_val, ++ reinterpret_cast(high_nib_scale), ++ reinterpret_cast(high_nib_bias)); ++ } ++ } ++ ++ return reinterpret_cast(r); ++ }; ++ }; ++ ++ public: ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ return ArrayConverterPacked32Bit::convert(source); ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++// for Array <= Array ++template ++struct NumericArrayConverter { ++ using result_type = Array; ++ using source_type = Array; ++ ++ struct RegConvert { ++ template ++ CUTLASS_DEVICE static PackedResultType convert(Array src_) { ++ uint32_t src = src_[0]; ++ // Hold output FP16s in reg. 
We need 1 reg for every 2 elements ++ using RegArray = ++ cutlass::AlignedArray; ++ RegArray r; ++ ++ uint32_t const prmt_indices[2] = {0x5150, 0x5352}; ++ static constexpr uint32_t start_byte_for_fp16 = 0x64646464; ++ ++ for (int ii = 0; ii < RegArray::kElements; ++ii) { ++ asm volatile("prmt.b32 %0,%1,%2,%3;\n" ++ : "=r"(r[ii]) ++ : "r"(src), "n"(start_byte_for_fp16), ++ "r"(prmt_indices[ii])); ++ } ++ ++ // -128 is folded into bias subtraction, i.e. the 0x80 in the low bytes ++ static constexpr uint32_t bias_rep = 0x64806480; ++ const half2& bias = reinterpret_cast(bias_rep); ++ CUTLASS_PRAGMA_UNROLL ++ for (int ii = 0; ii < RegArray::kElements; ++ii) { ++ half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]); ++ fp16x2_val = __hsub2(fp16x2_val, bias); ++ } ++ ++ return reinterpret_cast(r); ++ }; ++ }; ++ ++ public: ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ return ArrayConverterPacked32Bit::convert(source); ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++// for Array <= Array ++template ++struct NumericArrayConverter { ++ using result_type = Array; ++ using source_type = Array; ++ static FloatRoundStyle const round_style = Round; ++ ++ private: ++ struct RegConvert { ++ template ++ CUTLASS_DEVICE static PackedResultType convert(Array src_) { ++ uint32_t src = src_[0]; ++ PackedResultType r; ++ ++ // __byte_perm simulates the add.u32 0x4B000000 to every u8 element of ++ // u8x4 source and stores the result in r (without introducing extra ++ // cvt.u32.u8 instruction) ++ uint32_t const prmt_indices[4] = {0x7650, 0x7651, 0x7652, 0x7653}; ++ uint32_t* result_as_int = reinterpret_cast(&r); ++ for (int ii = 0; ii < PackedResultType::kElements; ++ii) { ++ result_as_int[ii] = __byte_perm(src, 0x4B000000, prmt_indices[ii]); ++ // Subtract the magic number 0x4B000000 from tmp in floating-point ++ // arithmetic to obtain final result ++ r[ii] -= (8388608.f + 128.f); // fold in -128 bias ++ } ++ ++ return r; ++ }; ++ }; ++ ++ public: ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ return ArrayConverterPacked32Bit::convert(source); ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) ++ ++// for Array <= Array ++template ++struct NumericArrayConverter { ++ using result_type = Array; ++ using source_type = Array; ++ ++ static FloatRoundStyle const round_style = Round; ++ ++ private: ++ struct RegConvert { ++ template ++ CUTLASS_DEVICE static PackedResultType convert(Array src_) { ++ uint32_t src_reg = src_[0]; ++ // Hold output BF16s in reg. 
We need 1 reg for every 2 elements ++ using RegArray = ++ cutlass::AlignedArray; ++ RegArray r; ++ uint32_t src_reg_shifted = src_reg >> 4; ++ ++ // Below constructs the following temporary: ++ uint32_t const prmt_indices[4] = {0xF4F0, 0xF5F1, 0xF6F2, 0xF7F3}; ++ static_assert(RegArray::kElements <= 4, ++ "Too many inputs for uint4b8_t -> BF16 vector converter"); ++ CUTLASS_PRAGMA_UNROLL ++ for (int ii = 0; ii < RegArray::kElements; ++ii) { ++ asm volatile( ++ "{\n" ++ " prmt.b32 %0, %1, %2, %3;\n" ++ "}\n" ++ : "=r"(r[ii]) ++ : "r"(src_reg), "r"(src_reg_shifted), "r"(prmt_indices[ii])); ++ } ++ ++ // Since the stored 4bit values are biased by 8 we get stored_val = (x+8) ++ // we are trying to construct x and a BF16 value ++ // The below XOR does the following: ++ // 1) Sets the exponent bits of the BF16 to the correct value for the ++ // BF16 magic_num. We will be constructing {128 + (x1+8), 128 + (x0+8)} ++ // and subtracting 136 to get {x1, x0} ++ static constexpr uint32_t xor_mask = 0x43004300; ++ static constexpr uint32_t and_mask = 0x000F000F; ++ static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; ++ ++ // For each operand, computes: ++ // r[i] = (r[i] & and_mask) ^ xor_mask ++ CUTLASS_PRAGMA_UNROLL ++ for (int ii = 0; ii < RegArray::kElements; ++ii) { ++ asm volatile( ++ "{\n" ++ " lop3.b32 %0, %0, %1, %2, %3;\n" ++ "}\n" ++ : "+r"(r[ii]) ++ : "n"(and_mask), "n"(xor_mask), "n"(immLut)); ++ } ++ ++ // We will issue 2 bfmas that do the following: ++ // high BF16: ++ // hi_bf16 - 136, lo_bf16 - 136 ++ ++ // This is the BF16 {136, 136} represented as an integer. ++ static constexpr uint32_t bias_rep = 0x43084308; ++ const __nv_bfloat162& bias = ++ reinterpret_cast(bias_rep); ++ ++ CUTLASS_PRAGMA_UNROLL ++ for (int ii = 0; ii < RegArray::kElements; ++ii) { ++ __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); ++ bf16x2_val = __hsub2(bf16x2_val, bias); ++ } ++ ++ return reinterpret_cast(r); ++ } ++ }; ++ ++ public: ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ return ArrayConverterPacked32Bit::convert(source); ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++// for Array <= Array ++// for IlvdLayout: (2, 4):(4, 1) ++template ++struct InterleavedNumericArrayConverter, Stride<_4, _1>>, ++ cutlass::bfloat16_t, vllm_uint4b8_t, N, ++ Round, void> { ++ using IlvdLayout = Layout, Stride<_4, _1>>; ++ static_assert(N % size(IlvdLayout{}) == 0); ++ ++ using result_type = Array; ++ using source_type = Array; ++ ++ private: ++ struct RegConvert { ++ template ++ CUTLASS_DEVICE static PackedResultType convert(Array src_) { ++ uint32_t src = src_[0]; ++ using RegArray = ++ cutlass::AlignedArray; ++ RegArray r; ++ ++ static_assert(PackedResultType::kElements <= size(IlvdLayout{})); ++ static constexpr uint32_t or_mask = 0x43004300; ++ ++ // Unlike float16 where the mantissa is large enough to contain 2 ++ // nibbles, bfloat16 can only fit one, so we can only convert one ++ // nibble at a time ++ for (int ii = 0; ii < RegArray::kElements; ++ii) { ++ r[ii] = src >> (4 * ii); ++ ++ static constexpr uint32_t and_or_imm_lut = (0xf0 & 0xcc) | 0xaa; ++ static constexpr uint32_t low_nib_mask = 0x000F000F; ++ ++ asm volatile( ++ "{\n" ++ " lop3.b32 %0, %0, %1, %2, %3;\n" ++ "}\n" ++ : "+r"(r[ii + 0]) ++ : "n"(low_nib_mask), "n"(or_mask), "n"(and_or_imm_lut)); ++ ++ // For low nibble: ++ // {x1, x0} = {128+(x1+8), 128+(x0+8)} * {1, 1} - {136, 136} ++ static constexpr uint32_t low_nib_bias = 
0x43084308; // {136, 136} ++ ++ { ++ __nv_bfloat162& fp16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); ++ fp16x2_val = ++ __hsub2(fp16x2_val, ++ reinterpret_cast(low_nib_bias)); ++ } ++ } ++ ++ return reinterpret_cast(r); ++ }; ++ }; ++ ++ public: ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ return ArrayConverterPacked32Bit::convert(source); ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++// for Array <= Array ++// for IlvdLayout: (2, 4):(4, 1) ++template ++struct InterleavedNumericArrayConverter, Stride<_4, _1>>, ++ cutlass::bfloat16_t, uint4_t, N, Round, ++ void> { ++ using IlvdLayout = Layout, Stride<_4, _1>>; ++ static_assert(N % size(IlvdLayout{}) == 0); ++ ++ using result_type = Array; ++ using source_type = Array; ++ ++ private: ++ struct RegConvert { ++ template ++ CUTLASS_DEVICE static PackedResultType convert(Array src_) { ++ uint32_t src = src_[0]; ++ using RegArray = ++ cutlass::AlignedArray; ++ RegArray r; ++ ++ static_assert(PackedResultType::kElements <= size(IlvdLayout{})); ++ static constexpr uint32_t or_mask = 0x43004300; ++ ++ // Unlike float16 where the mantissa is large enough to contain 2 ++ // nibbles, bfloat16 can only fit one, so we can only convert one ++ // nibble at a time ++ for (int ii = 0; ii < RegArray::kElements; ++ii) { ++ r[ii] = src >> (4 * ii); ++ ++ static constexpr uint32_t and_or_imm_lut = (0xf0 & 0xcc) | 0xaa; ++ static constexpr uint32_t low_nib_mask = 0x000F000F; ++ ++ asm volatile( ++ "{\n" ++ " lop3.b32 %0, %0, %1, %2, %3;\n" ++ "}\n" ++ : "+r"(r[ii]) ++ : "n"(low_nib_mask), "n"(or_mask), "n"(and_or_imm_lut)); ++ ++ // For low nibble: ++ // {x1, x0} = {128 + x1, 128 + x0} * {1, 1} - {128, 128} ++ static constexpr uint32_t low_nib_bias = 0x43004300; // {128, 128} ++ ++ { ++ __nv_bfloat162& fp16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); ++ fp16x2_val = ++ __hsub2(fp16x2_val, ++ reinterpret_cast(low_nib_bias)); ++ } ++ } ++ ++ return reinterpret_cast(r); ++ }; ++ }; ++ ++ public: ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ return ArrayConverterPacked32Bit::convert(source); ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++// for Array <= Array ++template ++struct NumericArrayConverter { ++ using result_type = Array; ++ using source_type = Array; ++ static FloatRoundStyle const round_style = Round; ++ ++ private: ++ using result_packed_4_t = Array; ++ using result_packed_2_t = Array; ++ using src_packed_4_t = Array; ++ using src_packed_2_t = Array; ++ ++ // Not Valid, not supported, only here to satisfy the interface and to avoid ++ // a compile error. 
ScalarConverter will not actually work until ++ // NumericConverter is ++ // implemented ++ using ScalarConverter = ++ NumericConverter; ++ ++ template ++ CUTLASS_DEVICE static PackedResultType packed_convert( ++ PackedSrcType const& source) { ++ static_assert( ++ (platform::is_same::value && ++ platform::is_same::value) || ++ (platform::is_same::value && ++ platform::is_same::value), ++ "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private " ++ "convert dispatch."); ++ ++ NumericArrayConverter ++ convert_uint8_to_f32; ++ Array tmp = ++ convert_uint8_to_f32(source); ++ NumericArrayConverter ++ convert_f32_to_bf16_; ++ return convert_f32_to_bf16_(tmp); ++ } ++ ++ friend class detail::VectorizedConverter; ++ ++ public: ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ result_type result; ++ using ConverterType = ++ NumericArrayConverter; ++ detail::VectorizedConverter::convert(result, source); ++ ++ return result; ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++#endif ++ ++// for Array <= Array ++// FastFP16toINT8 from https://arxiv.org/pdf/2406.09904 ++template ++struct NumericArrayConverter { ++ using result_type = Array; ++ using source_type = Array; ++ ++ struct RegConvert { ++ // FastFP16toINT8 from https://arxiv.org/pdf/2406.09904 ++ template ++ CUTLASS_DEVICE static PackedResultType convert( ++ Array src) { ++ // Hold output int8s in reg. We need 1 reg for every 4 elements ++ using RegArray = cutlass::AlignedArray< ++ uint32_t, std::max(PackedResultType::kElements / 4, size_t(1))>; ++ RegArray r; ++ ++ static constexpr uint32_t MAGIC_BIAS_ = 0x64806480; ++ auto MAGIC_BIAS = *reinterpret_cast(&MAGIC_BIAS_); ++ ++ *reinterpret_cast(&src[0]) = ++ __hadd2(*reinterpret_cast(&src[0]), MAGIC_BIAS); ++ ++ if constexpr (src_regs > 1) { ++ *reinterpret_cast(&src[1]) = ++ __hadd2(*reinterpret_cast(&src[1]), MAGIC_BIAS); ++ } ++ ++ static_assert(PackedResultType::kElements <= 4); ++ uint32_t uint8s; ++ static constexpr uint32_t MASK_0246 = 0x6420; ++ static constexpr uint32_t UINT8s_TO_INT8s_MASK = 0x80808080; ++ asm volatile("prmt.b32 %0,%1,%2,%3;\n" ++ : "=r"(uint8s) ++ : "r"(src[0]), "r"((src_regs > 1) ? 
src[1] : src[0]), ++ "n"(MASK_0246)); ++ ++ uint32_t int8s = (uint8s ^ UINT8s_TO_INT8s_MASK); ++ ++ return reinterpret_cast(int8s); ++ }; ++ }; ++ ++ public: ++ CUTLASS_DEVICE ++ static result_type convert(source_type const& source) { ++ return ArrayConverterPacked32Bit::convert(source); ++ } ++ ++ CUTLASS_DEVICE ++ result_type operator()(source_type const& s) const { return convert(s); } ++}; ++ ++///////////////////////////////////////////////////////////////////////////////////////////////// ++ ++} // namespace cutlass ++ ++///////////////////////////////////////////////////////////////////////////////////////////////// +diff --git a/csrc/cutlass_extensions/vllm_type_utils.cuh b/csrc/cutlass_extensions/vllm_type_utils.cuh +new file mode 100644 +index 0000000..500ed50 +--- /dev/null ++++ b/csrc/cutlass_extensions/vllm_type_utils.cuh +@@ -0,0 +1,42 @@ ++#include "cutlass/bfloat16.h" ++#include "cutlass/half.h" ++#include "cuda_bf16.h" ++ ++#include "cutlass_extensions/vllm_custom_types.cuh" ++ ++namespace cutlass { ++ ++template ++struct nameof { ++ static constexpr char const* value = "unknown"; ++}; ++ ++template ++inline constexpr auto nameof_v = nameof::value; ++ ++#define NAMEOF_TYPE(T) \ ++ template <> \ ++ struct nameof { \ ++ static constexpr char const* value = #T; \ ++ }; ++ ++NAMEOF_TYPE(float_e4m3_t) ++NAMEOF_TYPE(float_e5m2_t) ++NAMEOF_TYPE(half_t) ++NAMEOF_TYPE(nv_bfloat16) ++NAMEOF_TYPE(bfloat16_t) ++NAMEOF_TYPE(float) ++ ++NAMEOF_TYPE(int4b_t) ++NAMEOF_TYPE(int8_t) ++NAMEOF_TYPE(int32_t) ++NAMEOF_TYPE(int64_t) ++ ++NAMEOF_TYPE(vllm_uint4b8_t) ++NAMEOF_TYPE(uint4b_t) ++NAMEOF_TYPE(uint8_t) ++NAMEOF_TYPE(vllm_uint8b128_t) ++NAMEOF_TYPE(uint32_t) ++NAMEOF_TYPE(uint64_t) ++ ++}; // namespace cutlass +\ No newline at end of file +diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h +index 91abd9e..03414b7 100644 +--- a/csrc/dispatch_utils.h ++++ b/csrc/dispatch_utils.h +@@ -4,34 +4,46 @@ + */ + #pragma once + +-#include ++#include + +-#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ +- AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ +- AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ ++#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ ++ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +-#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +- AT_DISPATCH_SWITCH( \ +- TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) ++#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ ++ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +-#define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ +- AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ +- AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ +- AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ ++// TODO(luka/varun): use FP8_TYPE macro after refactoring ++#ifndef USE_ROCM ++ #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \ ++ AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) ++#else ++ #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \ ++ AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) ++#endif ++ ++#define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \ ++ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__)) ++ ++#define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) 
\ ++ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) + +-#define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ +- AT_DISPATCH_SWITCH( \ +- TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) +- +-#define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ +- AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ +- AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ +- AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ +- AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ ++#define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ ++ AT_DISPATCH_SWITCH(TYPE, NAME, \ ++ VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) ++ ++#define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ ++ AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ ++ AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) + +-#define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ +- AT_DISPATCH_SWITCH( \ +- TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) ++#define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ ++ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) +diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu +index e56b4d2..fb6882f 100644 +--- a/csrc/layernorm_kernels.cu ++++ b/csrc/layernorm_kernels.cu +@@ -1,198 +1,59 @@ +-#include +-#include ++#include "type_convert.cuh" ++#include "dispatch_utils.h" ++ ++#include + #include + +-#include "dispatch_utils.h" +-#include "reduction_utils.cuh" + #ifndef USE_ROCM +- #include +- #include ++ #include + #else +- #include +- #include +- +- using __nv_bfloat16 = __hip_bfloat16; +- using __nv_bfloat162 = __hip_bfloat162; ++ #include + #endif + + namespace vllm { + + // TODO(woosuk): Further optimize this kernel. 
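++// RMSNorm, per token x of length hidden_size:
++//   y_i = w_i * x_i / sqrt(mean_j(x_j^2) + epsilon)
++// One thread block handles one token; the squared-sum reduction below uses
++// cub::BlockReduce.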
+-template ++template + __global__ void rms_norm_kernel( +- scalar_t* __restrict__ out, // [..., hidden_size] +- const scalar_t* __restrict__ input, // [..., hidden_size] +- const scalar_t* __restrict__ weight, // [hidden_size] +- const float epsilon, +- const int num_tokens, +- const int hidden_size) { ++ scalar_t* __restrict__ out, // [..., hidden_size] ++ const scalar_t* __restrict__ input, // [..., hidden_size] ++ const scalar_t* __restrict__ weight, // [hidden_size] ++ const float epsilon, const int num_tokens, const int hidden_size) { + __shared__ float s_variance; + float variance = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { +- const float x = (float) input[blockIdx.x * hidden_size + idx]; ++ const float x = (float)input[blockIdx.x * hidden_size + idx]; + variance += x * x; + } +- variance = blockReduceSum(variance); ++ ++ using BlockReduce = cub::BlockReduce; ++ __shared__ typename BlockReduce::TempStorage reduceStore; ++ variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); ++ + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { +- float x = (float) input[blockIdx.x * hidden_size + idx]; +- out[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx]; ++ float x = (float)input[blockIdx.x * hidden_size + idx]; ++ out[blockIdx.x * hidden_size + idx] = ++ ((scalar_t)(x * s_variance)) * weight[idx]; + } + } + +- +-/* Converter structs for the conversion from torch types to HIP/CUDA types, +- and the associated type conversions within HIP/CUDA. These helpers need +- to be implemented for now because the relevant type conversion +- operators/constructors are not consistently implemented by HIP/CUDA, so +- a generic conversion via type casts cannot be implemented. +- +- Each struct should have the member static constexpr bool `exists`: +- If false, the optimized kernel is not used for the corresponding torch type. +- If true, the struct should be fully defined as shown in the examples below. 
+- */ +-template +-struct _typeConvert { static constexpr bool exists = false; }; +- +-#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) +-// CUDA < 12.0 runs into issues with packed type conversion +-template<> +-struct _typeConvert { +- static constexpr bool exists = true; +- using hip_type = __half; +- using packed_hip_type = __half2; +- +- __device__ static inline float convert(hip_type x) { return __half2float(x); } +- __device__ static inline float2 convert(packed_hip_type x) { return __half22float2(x); } +- __device__ static inline hip_type convert(float x) { return __float2half_rn(x); } +- __device__ static inline packed_hip_type convert(float2 x) { return __float22half2_rn(x); } +-}; +- +-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +-// CUDA_ARCH < 800 does not have BF16 support +-// TODO: Add in ROCm support once public headers handle bf16 maturely +-template<> +-struct _typeConvert { +- static constexpr bool exists = true; +- using hip_type = __nv_bfloat16; +- using packed_hip_type = __nv_bfloat162; +- +- __device__ static inline float convert(hip_type x) { return __bfloat162float(x); } +- __device__ static inline float2 convert(packed_hip_type x) { return __bfloat1622float2(x); } +- __device__ static inline hip_type convert(float x) { return __float2bfloat16(x); } +- __device__ static inline packed_hip_type convert(float2 x) { return __float22bfloat162_rn(x); } +-}; +-#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +-#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) +- +-/* Vector POD struct to generate vectorized and packed FP16/BF16 ops +- for appropriate specializations of fused_add_rms_norm_kernel. +- Only functions that are necessary in that kernel are implemented. +- Alignment to 16 bytes is required to use 128-bit global memory ops. 
+- */ +-template +-struct alignas(16) _f16Vec { +- /* Not theoretically necessary that width is a power of 2 but should +- almost always be the case for optimization purposes */ +- static_assert(width > 0 && (width & (width - 1)) == 0, +- "Width is not a positive power of 2!"); +- using Converter = _typeConvert; +- using T1 = typename Converter::hip_type; +- using T2 = typename Converter::packed_hip_type; +- T1 data[width]; +- +- __device__ _f16Vec& operator+=(const _f16Vec& other) { +- if constexpr (width % 2 == 0) { +- #pragma unroll +- for (int i = 0; i < width; i += 2) { +- T2 temp{data[i], data[i+1]}; +- temp += T2{other.data[i], other.data[i+1]}; +- data[i] = temp.x; +- data[i+1] = temp.y; +- } +- } else { +- #pragma unroll +- for (int i = 0; i < width; ++i) +- data[i] += other.data[i]; +- } +- return *this; +- } +- +- __device__ _f16Vec& operator*=(const _f16Vec& other) { +- if constexpr (width % 2 == 0) { +- #pragma unroll +- for (int i = 0; i < width; i += 2) { +- T2 temp{data[i], data[i+1]}; +- temp *= T2{other.data[i], other.data[i+1]}; +- data[i] = temp.x; +- data[i+1] = temp.y; +- } +- } else { +- #pragma unroll +- for (int i = 0; i < width; ++i) +- data[i] *= other.data[i]; +- } +- return *this; +- } +- +- __device__ _f16Vec& operator*=(const float scale) { +- if constexpr (width % 2 == 0) { +- #pragma unroll +- for (int i = 0; i < width; i += 2) { +- float2 temp_f = Converter::convert(T2{data[i], data[i+1]}); +- temp_f.x *= scale; +- temp_f.y *= scale; +- T2 temp = Converter::convert(temp_f); +- data[i] = temp.x; +- data[i+1] = temp.y; +- } +- } else { +- #pragma unroll +- for (int i = 0; i < width; ++i) { +- float temp = Converter::convert(data[i]) * scale; +- data[i] = Converter::convert(temp); +- } +- } +- return *this; +- } +- +- __device__ float sum_squares() const { +- float result = 0.0f; +- if constexpr (width % 2 == 0) { +- #pragma unroll +- for (int i = 0; i < width; i += 2) { +- float2 z = Converter::convert(T2{data[i], data[i+1]}); +- result += z.x * z.x + z.y * z.y; +- } +- } else { +- #pragma unroll +- for (int i = 0; i < width; ++i) { +- float x = Converter::convert(data[i]); +- result += x * x; +- } +- } +- return result; +- } +-}; +- + /* Function specialization in the case of FP16/BF16 tensors. + Additional optimizations we can make in this case are + packed and vectorized operations, which help with the + memory latency bottleneck. */ +-template +-__global__ std::enable_if_t< +- (width > 0) && _typeConvert::exists> fused_add_rms_norm_kernel( +- scalar_t* __restrict__ input, // [..., hidden_size] +- scalar_t* __restrict__ residual, // [..., hidden_size] +- const scalar_t* __restrict__ weight, // [hidden_size] +- const float epsilon, +- const int num_tokens, +- const int hidden_size) { ++template ++__global__ std::enable_if_t<(width > 0) && _typeConvert::exists> ++fused_add_rms_norm_kernel( ++ scalar_t* __restrict__ input, // [..., hidden_size] ++ scalar_t* __restrict__ residual, // [..., hidden_size] ++ const scalar_t* __restrict__ weight, // [hidden_size] ++ const float epsilon, const int num_tokens, const int hidden_size) { + // Sanity checks on our vector struct and type-punned pointer arithmetic + static_assert(std::is_pod_v<_f16Vec>); + static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); +@@ -203,9 +64,12 @@ __global__ std::enable_if_t< + /* These and the argument pointers are all declared `restrict` as they are + not aliased in practice. 
Argument pointers should not be dereferenced + in this kernel as that would be undefined behavior */ +- auto* __restrict__ input_v = reinterpret_cast<_f16Vec*>(input); +- auto* __restrict__ residual_v = reinterpret_cast<_f16Vec*>(residual); +- auto* __restrict__ weight_v = reinterpret_cast*>(weight); ++ auto* __restrict__ input_v = ++ reinterpret_cast<_f16Vec*>(input); ++ auto* __restrict__ residual_v = ++ reinterpret_cast<_f16Vec*>(residual); ++ auto* __restrict__ weight_v = ++ reinterpret_cast*>(weight); + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; +@@ -214,11 +78,11 @@ __global__ std::enable_if_t< + variance += temp.sum_squares(); + residual_v[id] = temp; + } +- /* Keep the following if-else block in sync with the +- calculation of max_block_size in fused_add_rms_norm */ +- if (num_tokens < 256) { +- variance = blockReduceSum(variance); +- } else variance = blockReduceSum(variance); ++ ++ using BlockReduce = cub::BlockReduce; ++ __shared__ typename BlockReduce::TempStorage reduceStore; ++ variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); ++ + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } +@@ -233,52 +97,49 @@ __global__ std::enable_if_t< + } + } + +- + /* Generic fused_add_rms_norm_kernel + The width field is not used here but necessary for other specializations. + */ +-template +-__global__ std::enable_if_t< +- (width == 0) || !_typeConvert::exists> fused_add_rms_norm_kernel( +- scalar_t* __restrict__ input, // [..., hidden_size] +- scalar_t* __restrict__ residual, // [..., hidden_size] +- const scalar_t* __restrict__ weight, // [hidden_size] +- const float epsilon, +- const int num_tokens, +- const int hidden_size) { ++template ++__global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> ++fused_add_rms_norm_kernel( ++ scalar_t* __restrict__ input, // [..., hidden_size] ++ scalar_t* __restrict__ residual, // [..., hidden_size] ++ const scalar_t* __restrict__ weight, // [hidden_size] ++ const float epsilon, const int num_tokens, const int hidden_size) { + __shared__ float s_variance; + float variance = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + scalar_t z = input[blockIdx.x * hidden_size + idx]; + z += residual[blockIdx.x * hidden_size + idx]; +- float x = (float) z; ++ float x = (float)z; + variance += x * x; + residual[blockIdx.x * hidden_size + idx] = z; + } +- /* Keep the following if-else block in sync with the +- calculation of max_block_size in fused_add_rms_norm */ +- if (num_tokens < 256) { +- variance = blockReduceSum(variance); +- } else variance = blockReduceSum(variance); ++ ++ using BlockReduce = cub::BlockReduce; ++ __shared__ typename BlockReduce::TempStorage reduceStore; ++ variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); ++ + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { +- float x = (float) residual[blockIdx.x * hidden_size + idx]; +- input[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx]; ++ float x = (float)residual[blockIdx.x * hidden_size + idx]; ++ input[blockIdx.x * hidden_size + idx] = ++ ((scalar_t)(x * s_variance)) * weight[idx]; + } + } + +-} // namespace vllm ++} // namespace vllm + +-void rms_norm( +- torch::Tensor& out, // [..., hidden_size] +- torch::Tensor& input, // [..., 
hidden_size] +- torch::Tensor& weight, // [hidden_size] +- float epsilon) { ++void rms_norm(torch::Tensor& out, // [..., hidden_size] ++ torch::Tensor& input, // [..., hidden_size] ++ torch::Tensor& weight, // [hidden_size] ++ double epsilon) { + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + +@@ -286,40 +147,27 @@ void rms_norm( + dim3 block(std::min(hidden_size, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); +- VLLM_DISPATCH_FLOATING_TYPES( +- input.scalar_type(), +- "rms_norm_kernel", +- [&] { +- vllm::rms_norm_kernel<<>>( +- out.data_ptr(), +- input.data_ptr(), +- weight.data_ptr(), +- epsilon, +- num_tokens, +- hidden_size); +- }); ++ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { ++ vllm::rms_norm_kernel<<>>( ++ out.data_ptr(), input.data_ptr(), ++ weight.data_ptr(), epsilon, num_tokens, hidden_size); ++ }); + } + +-#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ +- VLLM_DISPATCH_FLOATING_TYPES( \ +- input.scalar_type(), \ +- "fused_add_rms_norm_kernel", \ +- [&] { \ +- vllm::fused_add_rms_norm_kernel \ +- <<>>( \ +- input.data_ptr(), \ +- residual.data_ptr(), \ +- weight.data_ptr(), \ +- epsilon, \ +- num_tokens, \ +- hidden_size); \ +- }); +- +-void fused_add_rms_norm( +- torch::Tensor& input, // [..., hidden_size] +- torch::Tensor& residual, // [..., hidden_size] +- torch::Tensor& weight, // [hidden_size] +- float epsilon) { ++#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ ++ VLLM_DISPATCH_FLOATING_TYPES( \ ++ input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \ ++ vllm::fused_add_rms_norm_kernel \ ++ <<>>(input.data_ptr(), \ ++ residual.data_ptr(), \ ++ weight.data_ptr(), epsilon, \ ++ num_tokens, hidden_size); \ ++ }); ++ ++void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size] ++ torch::Tensor& residual, // [..., hidden_size] ++ torch::Tensor& weight, // [hidden_size] ++ double epsilon) { + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + +@@ -342,8 +190,8 @@ void fused_add_rms_norm( + auto inp_ptr = reinterpret_cast(input.data_ptr()); + auto res_ptr = reinterpret_cast(residual.data_ptr()); + auto wt_ptr = reinterpret_cast(weight.data_ptr()); +- bool ptrs_are_aligned = inp_ptr % 16 == 0 && res_ptr % 16 == 0 \ +- && wt_ptr % 16 == 0; ++ bool ptrs_are_aligned = ++ inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; + if (ptrs_are_aligned && hidden_size % 8 == 0) { + LAUNCH_FUSED_ADD_RMS_NORM(8); + } else { +diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu +new file mode 100644 +index 0000000..c18e2a4 +--- /dev/null ++++ b/csrc/layernorm_quant_kernels.cu +@@ -0,0 +1,234 @@ ++/* ++ * This file contains the CUDA kernels for the fused quantized layernorm. ++ * The kernels correspond to the kernels in layernorm_kernels.cu, except they ++ * also produce quantized output directly. ++ * Currently, only static fp8 quantization is supported. ++ */ ++ ++#include "type_convert.cuh" ++#include "quantization/fp8/common.cuh" ++#include "dispatch_utils.h" ++ ++#include ++#include ++ ++#ifndef USE_ROCM ++ #include ++#else ++ #include ++#endif ++ ++namespace vllm { ++ ++// TODO(woosuk): Further optimize this kernel. 
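++// Same RMSNorm math as rms_norm_kernel in layernorm_kernels.cu, but the
++// normalized value is immediately quantized to fp8 with a static scale:
++//   y_i = scaled_fp8_conversion(w_i * x_i / sqrt(mean_j(x_j^2) + epsilon),
++//                               1.0f / *scale)
++// which avoids materializing the fp16/bf16 intermediate in global memory.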
++template ++__global__ void rms_norm_static_fp8_quant_kernel( ++ FP8_TYPE* __restrict__ out, // [..., hidden_size] ++ const scalar_t* __restrict__ input, // [..., hidden_size] ++ const scalar_t* __restrict__ weight, // [hidden_size] ++ const float* __restrict__ scale, // [1] ++ const float epsilon, const int num_tokens, const int hidden_size) { ++ __shared__ float s_variance; ++ float variance = 0.0f; ++ ++ for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { ++ const float x = (float)input[blockIdx.x * hidden_size + idx]; ++ variance += x * x; ++ } ++ ++ using BlockReduce = cub::BlockReduce; ++ __shared__ typename BlockReduce::TempStorage reduceStore; ++ variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); ++ ++ if (threadIdx.x == 0) { ++ s_variance = rsqrtf(variance / hidden_size + epsilon); ++ } ++ __syncthreads(); ++ ++ // invert scale to avoid division ++ float const scale_inv = 1.0f / *scale; ++ ++ for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { ++ float x = (float)input[blockIdx.x * hidden_size + idx]; ++ float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; ++ out[blockIdx.x * hidden_size + idx] = ++ scaled_fp8_conversion(out_norm, scale_inv); ++ } ++} ++ ++/* Function specialization in the case of FP16/BF16 tensors. ++ Additional optimizations we can make in this case are ++ packed and vectorized operations, which help with the ++ memory latency bottleneck. */ ++template ++__global__ std::enable_if_t<(width > 0) && _typeConvert::exists> ++fused_add_rms_norm_static_fp8_quant_kernel( ++ FP8_TYPE* __restrict__ out, // [..., hidden_size] ++ scalar_t* __restrict__ input, // [..., hidden_size] ++ scalar_t* __restrict__ residual, // [..., hidden_size] ++ const scalar_t* __restrict__ weight, // [hidden_size] ++ const float* __restrict__ scale, // [1] ++ const float epsilon, const int num_tokens, const int hidden_size) { ++ // Sanity checks on our vector struct and type-punned pointer arithmetic ++ static_assert(std::is_pod_v<_f16Vec>); ++ static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); ++ ++ const int vec_hidden_size = hidden_size / width; ++ __shared__ float s_variance; ++ float variance = 0.0f; ++ /* These and the argument pointers are all declared `restrict` as they are ++ not aliased in practice. 
Argument pointers should not be dereferenced ++ in this kernel as that would be undefined behavior */ ++ auto* __restrict__ input_v = ++ reinterpret_cast<_f16Vec*>(input); ++ auto* __restrict__ residual_v = ++ reinterpret_cast<_f16Vec*>(residual); ++ auto* __restrict__ weight_v = ++ reinterpret_cast*>(weight); ++ ++ for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { ++ int id = blockIdx.x * vec_hidden_size + idx; ++ _f16Vec temp = input_v[id]; ++ temp += residual_v[id]; ++ variance += temp.sum_squares(); ++ residual_v[id] = temp; ++ } ++ ++ using BlockReduce = cub::BlockReduce; ++ __shared__ typename BlockReduce::TempStorage reduceStore; ++ variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); ++ ++ if (threadIdx.x == 0) { ++ s_variance = rsqrtf(variance / hidden_size + epsilon); ++ } ++ __syncthreads(); ++ ++ // invert scale to avoid division ++ float const scale_inv = 1.0f / *scale; ++ ++ for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { ++ int id = blockIdx.x * vec_hidden_size + idx; ++ _f16Vec temp = residual_v[id]; ++ temp *= s_variance; ++ temp *= weight_v[idx]; ++#pragma unroll ++ for (int i = 0; i < width; ++i) { ++ out[id * width + i] = ++ scaled_fp8_conversion(float(temp.data[i]), scale_inv); ++ } ++ } ++} ++ ++/* Generic fused_add_rms_norm_kernel ++ The width field is not used here but necessary for other specializations. ++ */ ++template ++__global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> ++fused_add_rms_norm_static_fp8_quant_kernel( ++ FP8_TYPE* __restrict__ out, // [..., hidden_size] ++ scalar_t* __restrict__ input, // [..., hidden_size] ++ scalar_t* __restrict__ residual, // [..., hidden_size] ++ const scalar_t* __restrict__ weight, // [hidden_size] ++ const float* __restrict__ scale, // [1] ++ const float epsilon, const int num_tokens, const int hidden_size) { ++ __shared__ float s_variance; ++ float variance = 0.0f; ++ ++ for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { ++ scalar_t z = input[blockIdx.x * hidden_size + idx]; ++ z += residual[blockIdx.x * hidden_size + idx]; ++ float x = (float)z; ++ variance += x * x; ++ residual[blockIdx.x * hidden_size + idx] = z; ++ } ++ ++ using BlockReduce = cub::BlockReduce; ++ __shared__ typename BlockReduce::TempStorage reduceStore; ++ variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); ++ ++ if (threadIdx.x == 0) { ++ s_variance = rsqrtf(variance / hidden_size + epsilon); ++ } ++ __syncthreads(); ++ ++ // invert scale to avoid division ++ float const scale_inv = 1.0f / *scale; ++ ++ for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { ++ float x = (float)residual[blockIdx.x * hidden_size + idx]; ++ float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; ++ out[blockIdx.x * hidden_size + idx] = ++ scaled_fp8_conversion(out_norm, scale_inv); ++ } ++} ++ ++} // namespace vllm ++ ++void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size] ++ torch::Tensor& input, // [..., hidden_size] ++ torch::Tensor& weight, // [hidden_size] ++ torch::Tensor& scale, // [1] ++ double epsilon) { ++ int hidden_size = input.size(-1); ++ int num_tokens = input.numel() / hidden_size; ++ ++ dim3 grid(num_tokens); ++ dim3 block(std::min(hidden_size, 1024)); ++ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); ++ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); ++ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { ++ 
vllm::rms_norm_static_fp8_quant_kernel ++ <<>>( ++ out.data_ptr(), input.data_ptr(), ++ weight.data_ptr(), scale.data_ptr(), epsilon, ++ num_tokens, hidden_size); ++ }); ++} ++ ++#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ ++ VLLM_DISPATCH_FLOATING_TYPES( \ ++ input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \ ++ vllm::fused_add_rms_norm_static_fp8_quant_kernel \ ++ <<>>( \ ++ out.data_ptr(), input.data_ptr(), \ ++ residual.data_ptr(), weight.data_ptr(), \ ++ scale.data_ptr(), epsilon, num_tokens, hidden_size); \ ++ }); ++ ++void fused_add_rms_norm_static_fp8_quant( ++ torch::Tensor& out, // [..., hidden_size], ++ torch::Tensor& input, // [..., hidden_size] ++ torch::Tensor& residual, // [..., hidden_size] ++ torch::Tensor& weight, // [hidden_size] ++ torch::Tensor& scale, // [1] ++ double epsilon) { ++ int hidden_size = input.size(-1); ++ int num_tokens = input.numel() / hidden_size; ++ ++ dim3 grid(num_tokens); ++ /* This kernel is memory-latency bound in many scenarios. ++ When num_tokens is large, a smaller block size allows ++ for increased block occupancy on CUs and better latency ++ hiding on global mem ops. */ ++ const int max_block_size = (num_tokens < 256) ? 1024 : 256; ++ dim3 block(std::min(hidden_size, max_block_size)); ++ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); ++ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); ++ /*If the tensor types are FP16/BF16, try to use the optimized kernel ++ with packed + vectorized ops. ++ Max optimization is achieved with a width-8 vector of FP16/BF16s ++ since we can load at most 128 bits at once in a global memory op. ++ However, this requires each tensor's data to be aligned to 16 ++ bytes. ++ */ ++ auto inp_ptr = reinterpret_cast(input.data_ptr()); ++ auto res_ptr = reinterpret_cast(residual.data_ptr()); ++ auto wt_ptr = reinterpret_cast(weight.data_ptr()); ++ bool ptrs_are_aligned = ++ inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; ++ if (ptrs_are_aligned && hidden_size % 8 == 0) { ++ LAUNCH_FUSED_ADD_RMS_NORM(8); ++ } else { ++ LAUNCH_FUSED_ADD_RMS_NORM(0); ++ } ++} +diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu +new file mode 100644 +index 0000000..f0e5533 +--- /dev/null ++++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu +@@ -0,0 +1,662 @@ ++// clang-format off ++// adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/causal_conv1d_fwd.cu ++// and https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/causal_conv1d_update.cu ++#include ++#include ++#include ++ ++#include "causal_conv1d.h" ++#include ++#include ++#include // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK ++ ++#include ++#include ++ ++#include "static_switch.h" ++ ++ ++ ++#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") ++ ++#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...) 
\ ++ if (ITYPE == at::ScalarType::Half) { \ ++ using input_t = at::Half; \ ++ using weight_t = at::Half; \ ++ __VA_ARGS__(); \ ++ } else if (ITYPE == at::ScalarType::BFloat16) { \ ++ using input_t = at::BFloat16; \ ++ using weight_t = at::BFloat16; \ ++ __VA_ARGS__(); \ ++ } else if (ITYPE == at::ScalarType::Float) { \ ++ using input_t = float; \ ++ using weight_t = float; \ ++ __VA_ARGS__(); \ ++ } else { \ ++ AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \ ++ } ++ ++ ++template ++void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); ++ ++template ++void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); ++ ++void set_conv_params_fwd(ConvParamsBase ¶ms, ++ // sizes ++ const size_t batch, ++ const size_t dim, ++ const size_t seqlen, ++ const size_t width, ++ // device pointers ++ const at::Tensor x, ++ const at::Tensor weight, ++ const at::Tensor out, ++ const std::optional& bias, ++ bool silu_activation, ++ int64_t pad_slot_id, ++ const std::optional& query_start_loc = std::nullopt, ++ const std::optional& cache_indices = std::nullopt, ++ const std::optional& has_initial_state = std::nullopt) { ++ ++ // Reset the parameters ++ memset(¶ms, 0, sizeof(params)); ++ ++ params.batch = batch; ++ params.dim = dim; ++ params.seqlen = seqlen; ++ params.width = width; ++ params.pad_slot_id = pad_slot_id; ++ ++ params.silu_activation = silu_activation; ++ ++ // Set the pointers and strides. ++ params.x_ptr = x.data_ptr(); ++ params.weight_ptr = weight.data_ptr(); ++ params.bias_ptr = bias.has_value() ? bias.value().data_ptr() : nullptr; ++ params.out_ptr = out.data_ptr(); ++ // All stride are in elements, not bytes. ++ params.query_start_loc_ptr = query_start_loc.has_value() ? query_start_loc.value().data_ptr() : nullptr; ++ params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr; ++ params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr; ++ const bool varlen = params.query_start_loc_ptr != nullptr; ++ params.x_batch_stride = x.stride(varlen ? 1 : 0); ++ params.x_c_stride = x.stride(varlen ? 0 : 1); ++ params.x_l_stride = x.stride(varlen ? 1 : -1); ++ params.weight_c_stride = weight.stride(0); ++ params.weight_width_stride = weight.stride(1); ++ params.out_batch_stride = out.stride(varlen ? 1 : 0); ++ params.out_c_stride = out.stride(varlen ? 0 : 1); ++ params.out_l_stride = out.stride(varlen ? 1 : -1); ++} ++ ++ ++void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, ++ const std::optional &bias_, ++ const std::optional &conv_states, ++ const std::optional &query_start_loc, ++ const std::optional &cache_indices, ++ const std::optional &has_initial_state, ++ bool silu_activation, ++ // used to identify padding entries if cache_indices provided ++ // in case of padding, the kernel will return early ++ int64_t pad_slot_id) { ++ auto input_type = x.scalar_type(); ++ auto weight_type = weight.scalar_type(); ++ TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); ++ TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::Half || weight_type == at::ScalarType::BFloat16); ++ ++ TORCH_CHECK(x.is_cuda()); ++ TORCH_CHECK(weight.is_cuda()); ++ ++ const bool varlen = query_start_loc.has_value() ? true : false; ++ const auto sizes = x.sizes(); ++ const int batch_size = varlen ? 
query_start_loc.value().sizes()[0] - 1 : sizes[0]; ++ const int dim = varlen ? sizes[0] : sizes[1]; ++ const int seqlen = varlen ? sizes[1] : sizes[2]; ++ const int width = weight.size(-1); ++ if (varlen){ ++ CHECK_SHAPE(x, dim, seqlen); ++ } ++ else { ++ CHECK_SHAPE(x, batch_size, dim, seqlen); ++ } ++ CHECK_SHAPE(weight, dim, width); ++ ++ ++ ++ if (bias_.has_value()) { ++ auto bias = bias_.value(); ++ TORCH_CHECK(bias.scalar_type() == weight_type); ++ TORCH_CHECK(bias.is_cuda()); ++ TORCH_CHECK(bias.stride(-1) == 1); ++ CHECK_SHAPE(bias, dim); ++ } ++ ++ ++ if (has_initial_state.has_value()) { ++ auto has_initial_state_ = has_initial_state.value(); ++ TORCH_CHECK(has_initial_state_.scalar_type() == at::ScalarType::Bool); ++ TORCH_CHECK(has_initial_state_.is_cuda()); ++ CHECK_SHAPE(has_initial_state_, batch_size); ++ } ++ ++ ++ if (query_start_loc.has_value()) { ++ auto query_start_loc_ = query_start_loc.value(); ++ TORCH_CHECK(query_start_loc_.scalar_type() == at::ScalarType::Int); ++ TORCH_CHECK(query_start_loc_.is_cuda()); ++ } ++ ++ ++ if (cache_indices.has_value()) { ++ auto cache_indices_ = cache_indices.value(); ++ TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int); ++ TORCH_CHECK(cache_indices_.is_cuda()); ++ CHECK_SHAPE(cache_indices_, batch_size); ++ } ++ ++ at::Tensor out = x; ++ ++ ConvParamsBase params; ++ set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out, ++ bias_, ++ silu_activation, ++ pad_slot_id, ++ query_start_loc, ++ cache_indices, ++ has_initial_state ++ ); ++ ++ if (conv_states.has_value()) { ++ auto conv_states_ = conv_states.value(); ++ TORCH_CHECK(conv_states_.scalar_type() == input_type); ++ TORCH_CHECK(conv_states_.is_cuda()); ++ params.conv_states_ptr = conv_states_.data_ptr(); ++ params.conv_states_batch_stride = conv_states_.stride(0); ++ params.conv_states_c_stride = conv_states_.stride(1); ++ params.conv_states_l_stride = conv_states_.stride(2); ++ } else { ++ params.conv_states_ptr = nullptr; ++ } ++ ++ // Otherwise the kernel will be launched from cuda:0 device ++ // Cast to char to avoid compiler warning about narrowing ++ at::cuda::CUDAGuard device_guard{(char)x.get_device()}; ++ auto stream = at::cuda::getCurrentCUDAStream().stream(); ++ DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] { ++ causal_conv1d_fwd_cuda(params, stream); ++ }); ++} ++ ++ ++void causal_conv1d_update(const at::Tensor &x, ++ const at::Tensor &conv_state, ++ const at::Tensor &weight, ++ const std::optional &bias_, ++ bool silu_activation, ++ const std::optional &cache_seqlens_, ++ const std::optional &conv_state_indices_, ++ // used to identify padding entries if cache_indices provided ++ // in case of padding, the kernel will return early ++ int64_t pad_slot_id) { ++ auto input_type = x.scalar_type(); ++ auto weight_type = weight.scalar_type(); ++ TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); ++ TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::Half || weight_type == at::ScalarType::BFloat16); ++ TORCH_CHECK(weight_type == input_type, "weight type must equal to input type, other variations are disabled due to binary size limitations"); ++ TORCH_CHECK(conv_state.scalar_type() == input_type); ++ ++ TORCH_CHECK(x.is_cuda()); ++ TORCH_CHECK(conv_state.is_cuda()); ++ TORCH_CHECK(weight.is_cuda()); ++ ++ const auto sizes = x.sizes(); ++ const int batch_size = sizes[0]; ++ const int dim = 
sizes[1]; ++ const int seqlen = sizes[2]; ++ const int width = weight.size(-1); ++ const int conv_state_len = conv_state.size(2); ++ TORCH_CHECK(conv_state_len >= width - 1); ++ ++ CHECK_SHAPE(x, batch_size, dim, seqlen); ++ CHECK_SHAPE(weight, dim, width); ++ ++ TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4"); ++ ++ if (bias_.has_value()) { ++ auto bias = bias_.value(); ++ TORCH_CHECK(bias.scalar_type() == weight_type); ++ TORCH_CHECK(bias.is_cuda()); ++ TORCH_CHECK(bias.stride(-1) == 1); ++ CHECK_SHAPE(bias, dim); ++ } ++ ++ at::Tensor out = x; ++ ++ ConvParamsBase params; ++ set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out, ++ bias_, ++ silu_activation, ++ pad_slot_id); ++ params.conv_state_ptr = conv_state.data_ptr(); ++ params.conv_state_len = conv_state_len; ++ // All stride are in elements, not bytes. ++ params.conv_state_batch_stride = conv_state.stride(0); ++ params.conv_state_c_stride = conv_state.stride(1); ++ params.conv_state_l_stride = conv_state.stride(2); ++ ++ if (cache_seqlens_.has_value()) { ++ auto cache_seqlens = cache_seqlens_.value(); ++ TORCH_CHECK(cache_seqlens.scalar_type() == torch::kInt32); ++ TORCH_CHECK(cache_seqlens.is_cuda()); ++ TORCH_CHECK(cache_seqlens.stride(-1) == 1); ++ CHECK_SHAPE(cache_seqlens, batch_size); ++ params.cache_seqlens = cache_seqlens.data_ptr(); ++ } else { ++ params.cache_seqlens = nullptr; ++ } ++ ++ if (conv_state_indices_.has_value()) { ++ auto conv_state_indices = conv_state_indices_.value(); ++ TORCH_CHECK(conv_state_indices.scalar_type() == torch::kInt32) ++ TORCH_CHECK(conv_state_indices.is_cuda()); ++ TORCH_CHECK(conv_state_indices.stride(0) == 1) ++ CHECK_SHAPE(conv_state_indices, batch_size); ++ ++ int conv_state_entries = conv_state.size(0); ++ CHECK_SHAPE(conv_state, conv_state_entries, dim, conv_state_len); ++ ++ params.conv_state_indices_ptr = conv_state_indices.data_ptr(); ++ } else { ++ CHECK_SHAPE(conv_state, batch_size, dim, conv_state_len); ++ params.conv_state_indices_ptr = nullptr; ++ } ++ ++ // Otherwise the kernel will be launched from cuda:0 device ++ // Cast to char to avoid compiler warning about narrowing ++ at::cuda::CUDAGuard device_guard{(char)x.get_device()}; ++ auto stream = at::cuda::getCurrentCUDAStream().stream(); ++ DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_update", [&] { ++ causal_conv1d_update_cuda(params, stream); ++ }); ++} ++ ++template ++struct Causal_conv1d_fwd_kernel_traits { ++ using input_t = input_t_; ++ using weight_t = weight_t_; ++ static constexpr int kNThreads = kNThreads_; ++ static constexpr int kWidth = kWidth_; ++ static constexpr int kNBytes = sizeof(input_t); ++ static_assert(kNBytes == 2 || kNBytes == 4); ++ static constexpr int kNElts = kNBytes == 4 ? 4 : 8; ++ static_assert(kWidth <= kNElts); ++ static constexpr bool kIsVecLoad = kIsVecLoad_; ++ using vec_t = typename BytesToType::Type; ++ using BlockLoadT = cub::BlockLoad; ++ using BlockLoadVecT = cub::BlockLoad; ++ using BlockStoreT = cub::BlockStore; ++ using BlockStoreVecT = cub::BlockStore; ++ static constexpr int kSmemIOSize = kIsVecLoad ++ ? 
0 ++ : custom_max({sizeof(typename BlockLoadT::TempStorage), sizeof(typename BlockStoreT::TempStorage)}); ++ static constexpr int kSmemExchangeSize = kNThreads * kNBytes * kNElts; ++ static constexpr int kSmemSize = kSmemIOSize + kSmemExchangeSize; ++}; ++ ++template ++__global__ __launch_bounds__(Ktraits::kNThreads) ++void causal_conv1d_fwd_kernel(ConvParamsBase params) { ++ constexpr int kWidth = Ktraits::kWidth; ++ constexpr int kNThreads = Ktraits::kNThreads; ++ constexpr int kNElts = Ktraits::kNElts; ++ constexpr bool kIsVecLoad = Ktraits::kIsVecLoad; ++ using input_t = typename Ktraits::input_t; ++ using vec_t = typename Ktraits::vec_t; ++ using weight_t = typename Ktraits::weight_t; ++ ++ // Shared memory. ++ extern __shared__ char smem_[]; ++ auto& smem_load = reinterpret_cast(smem_); ++ auto& smem_load_vec = reinterpret_cast(smem_); ++ auto& smem_store = reinterpret_cast(smem_); ++ auto& smem_store_vec = reinterpret_cast(smem_); ++ vec_t *smem_exchange = reinterpret_cast(smem_ + Ktraits::kSmemIOSize); ++ ++ const bool kVarlen = params.query_start_loc_ptr != nullptr; ++ const int tidx = threadIdx.x; ++ const int batch_id = blockIdx.x; ++ const int channel_id = blockIdx.y; ++ const int *query_start_loc = kVarlen ? reinterpret_cast(params.query_start_loc_ptr) : nullptr; ++ const int sequence_start_index = kVarlen ? query_start_loc[batch_id] : batch_id; ++ const int seqlen = kVarlen ? query_start_loc[batch_id + 1] - sequence_start_index : params.seqlen; ++ ++ input_t *x = reinterpret_cast(params.x_ptr) + sequence_start_index * params.x_batch_stride ++ + channel_id * params.x_c_stride; ++ weight_t *weight = reinterpret_cast(params.weight_ptr) + channel_id * params.weight_c_stride; ++ input_t *out = reinterpret_cast(params.out_ptr) + sequence_start_index * params.out_batch_stride ++ + channel_id * params.out_c_stride; ++ float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast(params.bias_ptr)[channel_id]); ++ ++ bool has_initial_state = params.has_initial_state_ptr == nullptr ? false ++ : reinterpret_cast(params.has_initial_state_ptr)[batch_id]; ++ ++ int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr ++ : reinterpret_cast(params.cache_indices_ptr); ++ int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id]; ++ // cache_index == params.pad_slot_id is defined as padding, so we exit early ++ if (cache_index == params.pad_slot_id){ ++ return; ++ } ++ input_t *conv_states = params.conv_states_ptr == nullptr ? nullptr ++ : reinterpret_cast(params.conv_states_ptr) + cache_index * params.conv_states_batch_stride + channel_id * params.conv_states_c_stride; ++ ++ // Thread 0 will load the last elements of the previous chunk, so we initialize those to 0. 
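[Editor's note] The traits above derive everything from one full 128-bit load per thread: kNElts is 8 halves or 4 floats, the exchange buffer holds exactly one such vector per thread, and a chunk is kNThreads * kNElts tokens. The following standalone host-side mirror of that arithmetic is only a sketch; the 128-thread / fp16 configuration is an illustrative assumption, not taken from the patch.

// Host-side mirror of the forward-kernel traits arithmetic; standalone sketch only.
#include <cstdio>
#include <cstdint>

template <int kNThreads, typename input_t>
struct FwdTraitsSketch {
  static constexpr int kNBytes = sizeof(input_t);        // 2 for fp16/bf16, 4 for fp32
  static constexpr int kNElts  = kNBytes == 4 ? 4 : 8;   // one 128-bit vector per thread
  static constexpr int kChunkSize = kNThreads * kNElts;  // tokens handled per loop iteration
  static constexpr int kSmemExchangeSize = kNThreads * kNBytes * kNElts;  // one vector per thread
};

int main() {
  using Traits = FwdTraitsSketch<128, uint16_t>;  // uint16_t stands in for fp16 storage
  std::printf("elts/thread=%d  chunk=%d tokens  exchange smem=%d bytes\n",
              Traits::kNElts, Traits::kChunkSize, Traits::kSmemExchangeSize);
  return 0;  // prints: elts/thread=8  chunk=1024 tokens  exchange smem=2048 bytes
}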
++ if (tidx == 0) { ++ input_t initial_state[kNElts] = {0}; ++ if (has_initial_state) { ++ #pragma unroll ++ for (int w = 0; w < kWidth - 1; ++w){ initial_state[kNElts - 1 - (kWidth - 2) + w ] = conv_states[w]; } ++ } ++ smem_exchange[kNThreads - 1] = reinterpret_cast(initial_state)[0]; ++ } ++ ++ float weight_vals[kWidth]; ++ #pragma unroll ++ for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); } ++ ++ constexpr int kChunkSize = kNThreads * kNElts; ++ const int n_chunks = (seqlen + kChunkSize - 1) / kChunkSize; ++ for (int chunk = 0; chunk < n_chunks; ++chunk) { ++ input_t x_vals_load[2 * kNElts] = {0}; ++ if constexpr(kIsVecLoad) { ++ typename Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast(x), *reinterpret_cast(&x_vals_load[kNElts]), (seqlen - chunk * kChunkSize) / kNElts); ++ } else { ++ __syncthreads(); ++ typename Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast(&x_vals_load[kNElts]), seqlen - chunk * kChunkSize); ++ } ++ x += kChunkSize; ++ __syncthreads(); ++ // Thread kNThreads - 1 don't write yet, so that thread 0 can read ++ // the last elements of the previous chunk. ++ if (tidx < kNThreads - 1) { smem_exchange[tidx] = reinterpret_cast(x_vals_load)[1]; } ++ __syncthreads(); ++ reinterpret_cast(x_vals_load)[0] = smem_exchange[tidx > 0 ? tidx - 1 : kNThreads - 1]; ++ __syncthreads(); ++ // Now thread kNThreads - 1 can write the last elements of the current chunk. ++ if (tidx == kNThreads - 1) { smem_exchange[tidx] = reinterpret_cast(x_vals_load)[1]; } ++ ++ float x_vals[2 * kNElts]; ++ #pragma unroll ++ for (int i = 0; i < 2 * kNElts; ++i) { x_vals[i] = float(x_vals_load[i]); } ++ ++ float out_vals[kNElts]; ++ #pragma unroll ++ for (int i = 0; i < kNElts; ++i) { ++ out_vals[i] = bias_val; ++ #pragma unroll ++ for (int w = 0; w < kWidth; ++w) { ++ out_vals[i] += weight_vals[w] * x_vals[kNElts + i - (kWidth - w - 1)]; ++ } ++ } ++ ++ if (params.silu_activation) { ++ #pragma unroll ++ for (int i = 0; i < kNElts; ++i) { ++ out_vals[i] = out_vals[i] / (1 + expf(-out_vals[i])); ++ } ++ } ++ ++ input_t out_vals_store[kNElts]; ++ #pragma unroll ++ for (int i = 0; i < kNElts; ++i) { out_vals_store[i] = out_vals[i]; } ++ if constexpr(kIsVecLoad) { ++ typename Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast(out), reinterpret_cast(out_vals_store), (seqlen - chunk * kChunkSize) / kNElts); ++ } else { ++ typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize); ++ } ++ out += kChunkSize; ++ ++ int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize); ++ // in case the final state is separated between the last "smem_exchange" and ++ // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2), ++ // (which occurs when `final_state_position` is a non-positivie index) ++ // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it ++ if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){ ++ input_t vals_load[kNElts] = {0}; ++ if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){ ++ // chunk = n_chunks - 2, a segment of the final state sits in the last index ++ reinterpret_cast(vals_load)[0] = smem_exchange[kNThreads - 1]; ++ #pragma unroll ++ for (int w = 0; w < -final_state_position; ++w){ ++ conv_states[w] = vals_load[kNElts + final_state_position + w]; ++ } ++ } ++ if ((chunk == n_chunks - 1) && tidx == 0){ ++ // chunk = n_chunks - 1, the second segment of the 
final state first positions ++ reinterpret_cast(vals_load)[0] = smem_exchange[0]; ++ for (int w = -final_state_position; w < kWidth - 1; ++w){ ++ conv_states[w] = vals_load[w + final_state_position]; ++ } ++ return; ++ } ++ } ++ } ++ // Final state is stored in the smem_exchange last token slot, ++ // in case seqlen < kWidth, we would need to take the final state from the ++ // initial state which is stored in conv_states ++ // in case seqlen > kWidth, we would need to load the last kWidth - 1 data ++ // and load it into conv_state accordingly ++ int last_thread = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize) / kNElts; ++ if (conv_states != nullptr && tidx == last_thread) { ++ input_t x_vals_load[kNElts * 2] = {0}; ++ // in case we are on the first kWidth tokens ++ if (last_thread == 0 && seqlen < kWidth){ ++ // Need to take the initial state ++ reinterpret_cast(x_vals_load)[0] = smem_exchange[0]; ++ const int offset = seqlen - (kWidth - 1); ++ #pragma unroll ++ for (int w = 0; w < kWidth - 1; ++w){ ++ // pad the existing state ++ if ((w - seqlen) >= 0 && has_initial_state) { conv_states[w - seqlen] = conv_states[w]; } ++ else if ((w - seqlen) >= 0 && !has_initial_state) { conv_states[w - seqlen] = input_t(0.0f); } ++ } ++ #pragma unroll ++ for (int w = 0; w < kWidth - 1; ++w){ ++ if (offset + w >= 0) ++ conv_states[w] = x_vals_load[offset + w ]; ++ } ++ } ++ else { ++ // in case the final state is in between the threads data ++ const int offset = ((seqlen - (kWidth - 1)) % (kNElts)); ++ if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){ ++ // In case last_thread == kNThreads - 1, accessing last_thread + 1 will result in a ++ // illegal access error on H100. ++ // Therefore, we access last_thread + 1, only if the final state data sits there ++ reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1]; ++ } ++ reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread]; ++ #pragma unroll ++ for (int w = 0; w < kWidth - 1; ++w){ ++ conv_states[w] = x_vals_load[offset + w ]; ++ } ++ } ++ ++ } ++} ++ ++ ++template ++void causal_conv1d_fwd_launch(ConvParamsBase ¶ms, cudaStream_t stream) { ++ static constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8; ++ const bool kVarlen = params.query_start_loc_ptr != nullptr; ++ BOOL_SWITCH(params.seqlen % kNElts == 0 && !kVarlen, kIsVecLoad, [&] { ++ using Ktraits = Causal_conv1d_fwd_kernel_traits; ++ constexpr int kSmemSize = Ktraits::kSmemSize; ++ dim3 grid(params.batch, params.dim); ++ ++ auto kernel = &causal_conv1d_fwd_kernel; ++ ++ if (kSmemSize >= 48 * 1024) { ++ #ifndef USE_ROCM ++ C10_CUDA_CHECK(cudaFuncSetAttribute( ++ kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); ++ #else ++ // There is a slight signature discrepancy in HIP and CUDA "FuncSetAttribute" function. ++ C10_CUDA_CHECK(cudaFuncSetAttribute( ++ (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); ++ std::cerr << "Warning (causal_conv1d fwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. 
\n" << std::endl; ++ #endif ++ } ++ kernel<<>>(params); ++ ++ C10_CUDA_KERNEL_LAUNCH_CHECK(); ++ }); ++} ++ ++template ++void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream) { ++ if (params.width == 2) { ++ causal_conv1d_fwd_launch<128, 2, input_t, weight_t>(params, stream); ++ } else if (params.width == 3) { ++ causal_conv1d_fwd_launch<128, 3, input_t, weight_t>(params, stream); ++ } else if (params.width == 4) { ++ causal_conv1d_fwd_launch<128, 4, input_t, weight_t>(params, stream); ++ } ++} ++ ++ ++template void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); ++template void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); ++template void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream); ++ ++ ++ ++ ++template ++struct Causal_conv1d_update_kernel_traits { ++ using input_t = input_t_; ++ using weight_t = weight_t_; ++ static constexpr int kNThreads = kNThreads_; ++ static constexpr int kWidth = kWidth_; ++ static constexpr int kNBytes = sizeof(input_t); ++ static_assert(kNBytes == 2 || kNBytes == 4); ++}; ++ ++template ++__global__ __launch_bounds__(Ktraits::kNThreads) ++void causal_conv1d_update_kernel(ConvParamsBase params) { ++ constexpr int kWidth = Ktraits::kWidth; ++ constexpr int kNThreads = Ktraits::kNThreads; ++ using input_t = typename Ktraits::input_t; ++ using weight_t = typename Ktraits::weight_t; ++ ++ const int tidx = threadIdx.x; ++ const int batch_id = blockIdx.x; ++ const int channel_id = blockIdx.y * kNThreads + tidx; ++ if (channel_id >= params.dim) return; ++ ++ input_t *x = reinterpret_cast(params.x_ptr) + batch_id * params.x_batch_stride ++ + channel_id * params.x_c_stride; ++ ++ // If params.conv_state_batch_indices is set, then the conv state is gathered from the conv state tensor ++ // along the batch axis. Otherwise, the conv state coordinate is the same as the batch id. ++ const int conv_state_batch_coord = params.conv_state_indices_ptr == nullptr ++ ? batch_id ++ : params.conv_state_indices_ptr[batch_id]; ++ // conv_state_batch_coord == params.pad_slot_id is defined as padding so we exit early ++ if (conv_state_batch_coord == params.pad_slot_id){ ++ return; ++ } ++ input_t *conv_state = reinterpret_cast(params.conv_state_ptr) ++ + conv_state_batch_coord * params.conv_state_batch_stride ++ + channel_id * params.conv_state_c_stride; ++ ++ weight_t *weight = reinterpret_cast(params.weight_ptr) + channel_id * params.weight_c_stride; ++ input_t *out = reinterpret_cast(params.out_ptr) + batch_id * params.out_batch_stride ++ + channel_id * params.out_c_stride; ++ float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast(params.bias_ptr)[channel_id]); ++ ++ int state_len = params.conv_state_len; ++ int advance_len = params.seqlen; ++ int cache_seqlen = kIsCircularBuffer ? params.cache_seqlens[batch_id] % state_len : 0; ++ int update_idx = cache_seqlen - (kWidth - 1); ++ update_idx = update_idx < 0 ? 
update_idx + state_len : update_idx; ++ ++ float weight_vals[kWidth] = {0}; ++ #pragma unroll ++ for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); } ++ ++ float x_vals[kWidth] = {0}; ++ if constexpr (!kIsCircularBuffer) { ++ #pragma unroll 2 ++ for (int i = 0; i < state_len - advance_len - (kWidth - 1); ++i) { ++ conv_state[i * params.conv_state_l_stride] = conv_state[(i + advance_len) * params.conv_state_l_stride]; ++ } ++ #pragma unroll ++ for (int i = 0; i < kWidth - 1; ++i) { ++ input_t state_val = conv_state[(state_len - (kWidth - 1) + i) * params.conv_state_l_stride]; ++ if (i < advance_len + (kWidth - 1) && state_len - advance_len - (kWidth - 1) + i >= 0) { ++ conv_state[(state_len - advance_len - (kWidth - 1) + i) * params.conv_state_l_stride] = state_val; ++ } ++ x_vals[i] = float(state_val); ++ } ++ } else { ++ #pragma unroll ++ for (int i = 0; i < kWidth - 1; ++i, update_idx = update_idx + 1 >= state_len ? update_idx + 1 - state_len : update_idx + 1) { ++ input_t state_val = conv_state[update_idx * params.conv_state_l_stride]; ++ x_vals[i] = float(state_val); ++ } ++ } ++ #pragma unroll 2 ++ for (int i = 0; i < params.seqlen; ++i) { ++ input_t x_val = x[i * params.x_l_stride]; ++ if constexpr (!kIsCircularBuffer) { ++ if (i < advance_len && state_len - advance_len + i >= 0) { ++ conv_state[(state_len - advance_len + i) * params.conv_state_l_stride] = x_val; ++ } ++ } else { ++ conv_state[update_idx * params.conv_state_l_stride] = x_val; ++ ++update_idx; ++ update_idx = update_idx >= state_len ? update_idx - state_len : update_idx; ++ } ++ x_vals[kWidth - 1] = float(x_val); ++ float out_val = bias_val; ++ #pragma unroll ++ for (int j = 0; j < kWidth; ++j) { out_val += weight_vals[j] * x_vals[j]; } ++ if (params.silu_activation) { out_val = out_val / (1 + expf(-out_val)); } ++ out[i * params.out_l_stride] = input_t(out_val); ++ // Shift the input buffer by 1 ++ #pragma unroll ++ for (int i = 0; i < kWidth - 1; ++i) { x_vals[i] = x_vals[i + 1]; } ++ } ++} ++ ++template ++void causal_conv1d_update_launch(ConvParamsBase ¶ms, cudaStream_t stream) { ++ using Ktraits = Causal_conv1d_update_kernel_traits; ++ dim3 grid(params.batch, (params.dim + kNThreads - 1) / kNThreads); ++ auto kernel = params.cache_seqlens == nullptr ++ ? &causal_conv1d_update_kernel ++ : &causal_conv1d_update_kernel; ++ kernel<<>>(params); ++ C10_CUDA_KERNEL_LAUNCH_CHECK(); ++} ++ ++template ++void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream) { ++ if (params.width == 2) { ++ causal_conv1d_update_launch<64, 2, input_t, weight_t>(params, stream); ++ } else if (params.width == 3) { ++ causal_conv1d_update_launch<64, 3, input_t, weight_t>(params, stream); ++ } else if (params.width == 4) { ++ causal_conv1d_update_launch<64, 4, input_t, weight_t>(params, stream); ++ } ++} ++ ++template void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); ++template void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); ++template void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream); +diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.h b/csrc/mamba/causal_conv1d/causal_conv1d.h +new file mode 100644 +index 0000000..e26684a +--- /dev/null ++++ b/csrc/mamba/causal_conv1d/causal_conv1d.h +@@ -0,0 +1,159 @@ ++/****************************************************************************** ++ * Copyright (c) 2024, Tri Dao. 
++ ******************************************************************************/ ++// clang-format off ++// adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/causal_conv1d.h ++#pragma once ++ ++#include ++#include ++//////////////////////////////////////////////////////////////////////////////////////////////////// ++ ++struct ConvParamsBase { ++ using index_t = uint32_t; ++ ++ int batch, dim, seqlen, width; ++ int64_t pad_slot_id; ++ bool silu_activation; ++ ++ index_t x_batch_stride; ++ index_t x_c_stride; ++ index_t x_l_stride; ++ index_t weight_c_stride; ++ index_t weight_width_stride; ++ index_t out_batch_stride; ++ index_t out_c_stride; ++ index_t out_l_stride; ++ ++ int conv_state_len; ++ index_t conv_state_batch_stride; ++ index_t conv_state_c_stride; ++ index_t conv_state_l_stride; ++ ++ // Common data pointers. ++ void *__restrict__ x_ptr; ++ void *__restrict__ weight_ptr; ++ void *__restrict__ bias_ptr; ++ void *__restrict__ out_ptr; ++ ++ void *__restrict__ conv_state_ptr; ++ void *__restrict__ query_start_loc_ptr; ++ void *__restrict__ has_initial_state_ptr; ++ void *__restrict__ cache_indices_ptr; ++ int32_t *__restrict__ cache_seqlens; ++ ++ // For the continuous batching case. Makes it so that the mamba state for ++ // the current batch doesn't need to be a contiguous tensor. ++ int32_t *__restrict__ conv_state_indices_ptr; ++ ++ void *__restrict__ seq_idx_ptr; ++ ++ // No __restrict__ since initial_states could be the same as final_states. ++ void * initial_states_ptr; ++ index_t initial_states_batch_stride; ++ index_t initial_states_l_stride; ++ index_t initial_states_c_stride; ++ ++ void * final_states_ptr; ++ index_t final_states_batch_stride; ++ index_t final_states_l_stride; ++ index_t final_states_c_stride; ++ ++ void * conv_states_ptr; ++ index_t conv_states_batch_stride; ++ index_t conv_states_l_stride; ++ index_t conv_states_c_stride; ++}; ++ ++ ++#ifndef USE_ROCM ++ #include ++ ++ template ++ __device__ inline T shuffle_xor(T val, int offset) { ++ return __shfl_xor_sync(uint32_t(-1), val, offset); ++ } ++ ++ constexpr size_t custom_max(std::initializer_list ilist) ++ { ++ return std::max(ilist); ++ } ++ ++ template ++ constexpr T constexpr_min(T a, T b) { ++ return std::min(a, b); ++ } ++ ++#else ++ #include ++ ++ template ++ __device__ inline T shuffle_xor(T val, int offset) { ++ return __shfl_xor(val, offset); ++ } ++ constexpr size_t custom_max(std::initializer_list ilist) ++ { ++ return *std::max_element(ilist.begin(), ilist.end()); ++ } ++ ++ template ++ constexpr T constexpr_min(T a, T b) { ++ return a < b ? 
a : b; ++ } ++#endif ++ ++//////////////////////////////////////////////////////////////////////////////////////////////////// ++ ++template struct BytesToType {}; ++ ++template<> struct BytesToType<16> { ++ using Type = uint4; ++ static_assert(sizeof(Type) == 16); ++}; ++ ++template<> struct BytesToType<8> { ++ using Type = uint64_t; ++ static_assert(sizeof(Type) == 8); ++}; ++ ++template<> struct BytesToType<4> { ++ using Type = uint32_t; ++ static_assert(sizeof(Type) == 4); ++}; ++ ++template<> struct BytesToType<2> { ++ using Type = uint16_t; ++ static_assert(sizeof(Type) == 2); ++}; ++ ++template<> struct BytesToType<1> { ++ using Type = uint8_t; ++ static_assert(sizeof(Type) == 1); ++}; ++ ++//////////////////////////////////////////////////////////////////////////////////////////////////// ++ ++template ++struct SumOp { ++__device__ inline T operator()(T const & x, T const & y) { return x + y; } ++}; ++ ++template ++struct Allreduce { ++ static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4); ++ template ++ static __device__ inline T run(T x, Operator &op) { ++ constexpr int OFFSET = THREADS / 2; ++ x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET)); ++ return Allreduce::run(x, op); ++ } ++}; ++ ++template<> ++struct Allreduce<2> { ++template ++static __device__ inline T run(T x, Operator &op) { ++ x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1)); ++ return x; ++} ++}; +diff --git a/csrc/mamba/causal_conv1d/static_switch.h b/csrc/mamba/causal_conv1d/static_switch.h +new file mode 100644 +index 0000000..ef74bf4 +--- /dev/null ++++ b/csrc/mamba/causal_conv1d/static_switch.h +@@ -0,0 +1,28 @@ ++// Inspired by ++// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h ++// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h ++// clang-format off ++// adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/static_switch.h ++ ++#pragma once ++ ++/// @param COND - a boolean expression to switch by ++/// @param CONST_NAME - a name given for the constexpr bool variable. ++/// @param ... - code to execute for true and false ++/// ++/// Usage: ++/// ``` ++/// BOOL_SWITCH(flag, BoolConst, [&] { ++/// some_function(...); ++/// }); ++/// ``` ++#define BOOL_SWITCH(COND, CONST_NAME, ...) \ ++ [&] { \ ++ if (COND) { \ ++ static constexpr bool CONST_NAME = true; \ ++ return __VA_ARGS__(); \ ++ } else { \ ++ static constexpr bool CONST_NAME = false; \ ++ return __VA_ARGS__(); \ ++ } \ ++ }() +diff --git a/csrc/mamba/mamba_ssm/selective_scan.h b/csrc/mamba/mamba_ssm/selective_scan.h +new file mode 100644 +index 0000000..563d2fe +--- /dev/null ++++ b/csrc/mamba/mamba_ssm/selective_scan.h +@@ -0,0 +1,266 @@ ++/****************************************************************************** ++ * Copyright (c) 2023, Tri Dao. 
++ ******************************************************************************/ ++// clang-format off ++// adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/selective_scan.h ++ ++#pragma once ++ ++#ifndef USE_ROCM ++ #include ++#else ++ #include ++#endif ++#include ++//////////////////////////////////////////////////////////////////////////////////////////////////// ++ ++struct SSMParamsBase { ++ using index_t = uint32_t; ++ ++ int batch, dim, seqlen, dstate, n_groups, n_chunks; ++ int dim_ngroups_ratio; ++ bool is_variable_B; ++ bool is_variable_C; ++ int64_t pad_slot_id; ++ ++ bool delta_softplus; ++ ++ index_t A_d_stride; ++ index_t A_dstate_stride; ++ index_t B_batch_stride; ++ index_t B_d_stride; ++ index_t B_dstate_stride; ++ index_t B_group_stride; ++ index_t C_batch_stride; ++ index_t C_d_stride; ++ index_t C_dstate_stride; ++ index_t C_group_stride; ++ index_t u_batch_stride; ++ index_t u_d_stride; ++ index_t delta_batch_stride; ++ index_t delta_d_stride; ++ index_t z_batch_stride; ++ index_t z_d_stride; ++ index_t out_batch_stride; ++ index_t out_d_stride; ++ index_t out_z_batch_stride; ++ index_t out_z_d_stride; ++ ++ // Common data pointers. ++ void *__restrict__ A_ptr; ++ void *__restrict__ B_ptr; ++ void *__restrict__ C_ptr; ++ void *__restrict__ D_ptr; ++ void *__restrict__ u_ptr; ++ void *__restrict__ delta_ptr; ++ void *__restrict__ delta_bias_ptr; ++ void *__restrict__ out_ptr; ++ void *__restrict__ ssm_states_ptr; ++ void *__restrict__ z_ptr; ++ void *__restrict__ out_z_ptr; ++ ++ void *__restrict__ query_start_loc_ptr; ++ void *__restrict__ cache_indices_ptr; ++ void *__restrict__ has_initial_state_ptr; ++ ++}; ++ ++ ++ ++ ++#ifndef USE_ROCM ++ ++ constexpr size_t custom_max(std::initializer_list ilist) ++ { ++ return std::max(ilist); ++ } ++ ++ template ++ constexpr T constexpr_min(T a, T b) { ++ return std::min(a, b); ++ } ++ ++#else ++ constexpr size_t custom_max(std::initializer_list ilist) ++ { ++ return *std::max_element(ilist.begin(), ilist.end()); ++ } ++ ++ template ++ constexpr T constexpr_min(T a, T b) { ++ return a < b ? 
a : b; ++ } ++#endif ++ ++ ++#define MAX_DSTATE 256 ++ ++ ++inline __device__ float2 operator+(const float2 & a, const float2 & b){ ++ return {a.x + b.x, a.y + b.y}; ++} ++ ++inline __device__ float3 operator+(const float3 &a, const float3 &b) { ++ return {a.x + b.x, a.y + b.y, a.z + b.z}; ++} ++ ++inline __device__ float4 operator+(const float4 & a, const float4 & b){ ++ return {a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w}; ++} ++ ++//////////////////////////////////////////////////////////////////////////////////////////////////// ++ ++template struct BytesToType {}; ++ ++template<> struct BytesToType<16> { ++ using Type = uint4; ++ static_assert(sizeof(Type) == 16); ++}; ++ ++template<> struct BytesToType<8> { ++ using Type = uint64_t; ++ static_assert(sizeof(Type) == 8); ++}; ++ ++template<> struct BytesToType<4> { ++ using Type = uint32_t; ++ static_assert(sizeof(Type) == 4); ++}; ++ ++template<> struct BytesToType<2> { ++ using Type = uint16_t; ++ static_assert(sizeof(Type) == 2); ++}; ++ ++template<> struct BytesToType<1> { ++ using Type = uint8_t; ++ static_assert(sizeof(Type) == 1); ++}; ++ ++//////////////////////////////////////////////////////////////////////////////////////////////////// ++ ++template ++struct Converter{ ++ static inline __device__ void to_float(const scalar_t (&src)[N], float (&dst)[N]) { ++ #pragma unroll ++ for (int i = 0; i < N; ++i) { dst[i] = src[i]; } ++ } ++}; ++ ++template ++struct Converter{ ++ static inline __device__ void to_float(const at::Half (&src)[N], float (&dst)[N]) { ++ static_assert(N % 2 == 0); ++ auto &src2 = reinterpret_cast(src); ++ auto &dst2 = reinterpret_cast(dst); ++ #pragma unroll ++ for (int i = 0; i < N / 2; ++i) { dst2[i] = __half22float2(src2[i]); } ++ } ++}; ++ ++#if __CUDA_ARCH__ >= 800 ++template ++struct Converter{ ++ static inline __device__ void to_float(const at::BFloat16 (&src)[N], float (&dst)[N]) { ++ static_assert(N % 2 == 0); ++ auto &src2 = reinterpret_cast(src); ++ auto &dst2 = reinterpret_cast(dst); ++ #pragma unroll ++ for (int i = 0; i < N / 2; ++i) { dst2[i] = __bfloat1622float2(src2[i]); } ++ } ++}; ++#endif ++ ++//////////////////////////////////////////////////////////////////////////////////////////////////// ++ ++ ++template struct SSMScanOp; ++ ++template<> ++struct SSMScanOp { ++ __device__ __forceinline__ float2 operator()(const float2 &ab0, const float2 &ab1) const { ++ return make_float2(ab1.x * ab0.x, ab1.x * ab0.y + ab1.y); ++ } ++}; ++ ++// A stateful callback functor that maintains a running prefix to be applied ++// during consecutive scan operations. ++template struct SSMScanPrefixCallbackOp { ++ using scan_t = std::conditional_t, float2, float4>; ++ scan_t running_prefix; ++ // Constructor ++ __device__ SSMScanPrefixCallbackOp(scan_t running_prefix_) : running_prefix(running_prefix_) {} ++ // Callback operator to be entered by the first warp of threads in the block. ++ // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
++ __device__ scan_t operator()(scan_t block_aggregate) { ++ scan_t old_prefix = running_prefix; ++ running_prefix = SSMScanOp()(running_prefix, block_aggregate); ++ return old_prefix; ++ } ++}; ++ ++//////////////////////////////////////////////////////////////////////////////////////////////////// ++ ++template ++inline __device__ void load_input(typename Ktraits::input_t *u, ++ typename Ktraits::input_t (&u_vals)[Ktraits::kNItems], ++ typename Ktraits::BlockLoadT::TempStorage &smem_load, ++ int seqlen) { ++ if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) { ++ auto& smem_load_vec = reinterpret_cast(smem_load); ++ using vec_t = typename Ktraits::vec_t; ++ typename Ktraits::BlockLoadVecT(smem_load_vec).Load( ++ reinterpret_cast(u), ++ reinterpret_cast(u_vals) ++ #ifdef USE_ROCM ++ , Ktraits::kNThreads * Ktraits::kNLoads ++ #endif ++ ++ ); ++ } else { ++ typename Ktraits::BlockLoadT(smem_load).Load(u, u_vals, seqlen, 0.f); ++ } ++} ++ ++ ++template ++inline __device__ void load_weight(typename Ktraits::input_t *Bvar, ++ typename Ktraits::weight_t (&B_vals)[Ktraits::kNItems], ++ typename Ktraits::BlockLoadWeightT::TempStorage &smem_load_weight, ++ int seqlen) { ++ constexpr int kNItems = Ktraits::kNItems; ++ typename Ktraits::input_t B_vals_load[kNItems]; ++ if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) { ++ auto& smem_load_weight_vec = reinterpret_cast(smem_load_weight); ++ using vec_t = typename Ktraits::vec_t; ++ typename Ktraits::BlockLoadWeightVecT(smem_load_weight_vec).Load( ++ reinterpret_cast(Bvar), ++ reinterpret_cast(B_vals_load) ++ ); ++ } else { ++ typename Ktraits::BlockLoadWeightT(smem_load_weight).Load(Bvar, B_vals_load, seqlen, 0.f); ++ } ++ // #pragma unroll ++ // for (int i = 0; i < kNItems; ++i) { B_vals[i] = B_vals_load[i]; } ++ Converter::to_float(B_vals_load, B_vals); ++} ++ ++template ++inline __device__ void store_output(typename Ktraits::input_t *out, ++ const float (&out_vals)[Ktraits::kNItems], ++ typename Ktraits::BlockStoreT::TempStorage &smem_store, ++ int seqlen) { ++ typename Ktraits::input_t write_vals[Ktraits::kNItems]; ++ #pragma unroll ++ for (int i = 0; i < Ktraits::kNItems; ++i) { write_vals[i] = out_vals[i]; } ++ if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) { ++ auto& smem_store_vec = reinterpret_cast(smem_store); ++ using vec_t = typename Ktraits::vec_t; ++ typename Ktraits::BlockStoreVecT(smem_store_vec).Store( ++ reinterpret_cast(out), ++ reinterpret_cast(write_vals) ++ ); ++ } else { ++ typename Ktraits::BlockStoreT(smem_store).Store(out, write_vals, seqlen); ++ } ++} +diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +new file mode 100644 +index 0000000..bd0a341 +--- /dev/null ++++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +@@ -0,0 +1,658 @@ ++// clang-format off ++// adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/selective_scan_fwd_kernel.cuh ++#include ++#include ++#include ++#include "selective_scan.h" ++ ++#include ++#include ++#include // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK ++ ++#ifndef USE_ROCM ++ #include ++ #include ++ #include ++#else ++ #include ++ namespace cub = hipcub; ++#endif ++ ++#include "selective_scan.h" ++#include "static_switch.h" ++ ++template ++struct Selective_Scan_fwd_kernel_traits { ++ static_assert(kNItems_ % 4 == 0); ++ using input_t = input_t_; ++ using weight_t = weight_t_; ++ static constexpr int kNThreads = kNThreads_; ++ // Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads 
improves occupancy. ++ static constexpr int kMinBlocks = kNThreads < 128 ? 5 : 3; ++ static constexpr int kNItems = kNItems_; ++ static constexpr int kNRows = kNRows_; ++ static constexpr int kNBytes = sizeof(input_t); ++ static_assert(kNBytes == 2 || kNBytes == 4); ++ static constexpr int kNElts = kNBytes == 4 ? 4 : constexpr_min(8, kNItems); ++ static_assert(kNItems % kNElts == 0); ++ static constexpr int kNLoads = kNItems / kNElts; ++ static constexpr bool kIsEvenLen = kVarlen_ ? false : kIsEvenLen_; ++ static constexpr bool kIsVariableB = kIsVariableB_; ++ static constexpr bool kIsVariableC = kIsVariableC_; ++ static constexpr bool kHasZ = kHasZ_; ++ static constexpr bool kVarlen = kVarlen_; ++ ++ static constexpr bool kDirectIO = kVarlen_ ? false : kIsEvenLen && kNLoads == 1; ++ static constexpr int kNLoadsIndex = kNItems / 4; ++ using vec_t = typename BytesToType::Type; ++ using scan_t = float2; ++ using BlockLoadT = cub::BlockLoad; ++ using BlockLoadVecT = cub::BlockLoad; ++ using BlockLoadWeightT = cub::BlockLoad; ++ using BlockLoadWeightVecT = cub::BlockLoad; ++ using BlockStoreT = cub::BlockStore; ++ using BlockStoreVecT = cub::BlockStore; ++ // using BlockScanT = cub::BlockScan; ++ // using BlockScanT = cub::BlockScan; ++ using BlockScanT = cub::BlockScan; ++ static constexpr int kSmemIOSize = custom_max({sizeof(typename BlockLoadT::TempStorage), ++ sizeof(typename BlockLoadVecT::TempStorage), ++ (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightT::TempStorage), ++ (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightVecT::TempStorage), ++ sizeof(typename BlockStoreT::TempStorage), ++ sizeof(typename BlockStoreVecT::TempStorage)}); ++ static constexpr int kSmemSize = kSmemIOSize + sizeof(typename BlockScanT::TempStorage); ++}; ++ ++template ++__global__ __launch_bounds__(Ktraits::kNThreads, Ktraits::kMinBlocks) ++void selective_scan_fwd_kernel(SSMParamsBase params) { ++ constexpr bool kIsVariableB = Ktraits::kIsVariableB; ++ constexpr bool kIsVariableC = Ktraits::kIsVariableC; ++ constexpr bool kHasZ = Ktraits::kHasZ; ++ constexpr bool kVarlen = Ktraits::kVarlen; ++ constexpr int kNThreads = Ktraits::kNThreads; ++ constexpr int kNItems = Ktraits::kNItems; ++ constexpr int kNRows = Ktraits::kNRows; ++ constexpr bool kDirectIO = Ktraits::kDirectIO; ++ using input_t = typename Ktraits::input_t; ++ using weight_t = typename Ktraits::weight_t; ++ using scan_t = typename Ktraits::scan_t; ++ ++ // Shared memory. 
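[Editor's note] kSmemIOSize above is the maximum of the load/store TempStorage footprints because those phases are fenced by __syncthreads() and can alias one region of dynamic shared memory, with the BlockScan scratch (and the running-prefix slots) laid out after it. A host-side sketch of that sizing pattern, using invented placeholder sizes rather than the real cub types:

// Standalone sketch of the shared-memory layout idea: one buffer sized for the
// largest I/O TempStorage, reused by loads and stores, with scan scratch after it.
#include <algorithm>
#include <cstddef>
#include <cstdio>

struct LoadTempSketch  { char bytes[256]; };   // stands in for BlockLoadT::TempStorage
struct StoreTempSketch { char bytes[384]; };   // stands in for BlockStoreT::TempStorage
struct ScanTempSketch  { char bytes[512]; };   // stands in for BlockScanT::TempStorage

int main() {
  // Loads and stores never run concurrently, so they can share one region.
  constexpr std::size_t kSmemIOSize = std::max(sizeof(LoadTempSketch), sizeof(StoreTempSketch));
  // The scan scratch follows the shared I/O region, as in Ktraits::kSmemSize.
  constexpr std::size_t kSmemSize = kSmemIOSize + sizeof(ScanTempSketch);
  std::printf("I/O region: %zu bytes, total dynamic smem: %zu bytes\n", kSmemIOSize, kSmemSize);
  return 0;
}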
++ extern __shared__ char smem_[]; ++ // cast to lvalue reference of expected type ++ // char *smem_loadstorescan = smem_ + 2 * MAX_DSTATE * sizeof(weight_t); ++ // auto& smem_load = reinterpret_cast(smem_ + 2 * MAX_DSTATE * sizeof(weight_t)); ++ // auto& smem_load = reinterpret_cast(smem_loadstorescan); ++ auto& smem_load = reinterpret_cast(smem_); ++ auto& smem_load_weight = reinterpret_cast(smem_); ++ auto& smem_load_weight1 = *reinterpret_cast(smem_ + sizeof(typename Ktraits::BlockLoadWeightT::TempStorage)); ++ auto& smem_store = reinterpret_cast(smem_); ++ auto& smem_scan = *reinterpret_cast(smem_ + Ktraits::kSmemIOSize); ++ // weight_t *smem_a = reinterpret_cast(smem_ + smem_loadstorescan_size); ++ // weight_t *smem_bc = reinterpret_cast(smem_a + MAX_DSTATE); ++ scan_t *smem_running_prefix = reinterpret_cast(smem_ + Ktraits::kSmemSize); ++ ++ const int batch_id = blockIdx.x; ++ const int dim_id = blockIdx.y; ++ const int group_id = dim_id / (params.dim_ngroups_ratio); ++ int seqlen = params.seqlen; ++ int sequence_start_index = batch_id; ++ if constexpr (kVarlen){ ++ int *query_start_loc = reinterpret_cast(params.query_start_loc_ptr); ++ sequence_start_index = query_start_loc[batch_id]; ++ seqlen = query_start_loc[batch_id + 1] - sequence_start_index; ++ } ++ const bool has_initial_state = params.has_initial_state_ptr == nullptr ? false ++ : reinterpret_cast(params.has_initial_state_ptr)[batch_id]; ++ ++ const int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr ++ : reinterpret_cast(params.cache_indices_ptr); ++ const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id]; ++ // cache_index == params.pad_slot_id is defined as padding, so we exit early ++ if (cache_index == params.pad_slot_id){ ++ return; ++ } ++ input_t *u = reinterpret_cast(params.u_ptr) + sequence_start_index * params.u_batch_stride ++ + dim_id * kNRows * params.u_d_stride; ++ input_t *delta = reinterpret_cast(params.delta_ptr) + sequence_start_index * params.delta_batch_stride ++ + dim_id * kNRows * params.delta_d_stride; ++ weight_t *A = reinterpret_cast(params.A_ptr) + dim_id * kNRows * params.A_d_stride; ++ weight_t *B = reinterpret_cast(params.B_ptr) + dim_id * kNRows * params.B_d_stride; ++ input_t *Bvar = reinterpret_cast(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride; ++ weight_t *C = reinterpret_cast(params.C_ptr) + dim_id * kNRows * params.C_d_stride; ++ input_t *Cvar = reinterpret_cast(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride; ++ input_t *ssm_states = reinterpret_cast(params.ssm_states_ptr) + (cache_index * params.dim + dim_id * kNRows) * params.dstate; ++ ++ float D_val[kNRows] = {0}; ++ if (params.D_ptr != nullptr) { ++ #pragma unroll ++ for (int r = 0; r < kNRows; ++r) { ++ D_val[r] = reinterpret_cast(params.D_ptr)[dim_id * kNRows + r]; ++ } ++ } ++ float delta_bias[kNRows] = {0}; ++ if (params.delta_bias_ptr != nullptr) { ++ #pragma unroll ++ for (int r = 0; r < kNRows; ++r) { ++ delta_bias[r] = reinterpret_cast(params.delta_bias_ptr)[dim_id * kNRows + r]; ++ } ++ } ++ ++ ++ // for (int state_idx = threadIdx.x; state_idx < params.dstate; state_idx += blockDim.x) { ++ // smem_a[state_idx] = A[state_idx * params.A_dstate_stride]; ++ // smem_bc[state_idx] = B[state_idx * params.B_dstate_stride] * C[state_idx * params.C_dstate_stride]; ++ // } ++ ++ constexpr int kChunkSize = kNThreads * kNItems; ++ const int n_chunks = (seqlen + 2048 - 1) / 2048; ++ for (int 
chunk = 0; chunk < n_chunks; ++chunk) { ++ input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems]; ++ ++ __syncthreads(); ++ #pragma unroll ++ for (int r = 0; r < kNRows; ++r) { ++ if constexpr (!kDirectIO) { ++ if (r > 0) { __syncthreads(); } ++ } ++ load_input(u + r * params.u_d_stride, u_vals[r], smem_load, seqlen - chunk * kChunkSize); ++ if constexpr (!kDirectIO) { __syncthreads(); } ++ load_input(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, seqlen - chunk * kChunkSize); ++ } ++ u += kChunkSize; ++ delta += kChunkSize; ++ ++ float delta_vals[kNRows][kNItems], delta_u_vals[kNRows][kNItems], out_vals[kNRows][kNItems]; ++ #pragma unroll ++ for (int r = 0; r < kNRows; ++r) { ++ #pragma unroll ++ for (int i = 0; i < kNItems; ++i) { ++ float u_val = float(u_vals[r][i]); ++ delta_vals[r][i] = float(delta_vals_load[r][i]) + delta_bias[r]; ++ if (params.delta_softplus) { ++ delta_vals[r][i] = delta_vals[r][i] <= 20.f ? log1pf(expf(delta_vals[r][i])) : delta_vals[r][i]; ++ } ++ delta_u_vals[r][i] = delta_vals[r][i] * u_val; ++ out_vals[r][i] = D_val[r] * u_val; ++ } ++ } ++ ++ __syncthreads(); ++ for (int state_idx = 0; state_idx < params.dstate; ++state_idx) { ++ weight_t A_val[kNRows]; ++ #pragma unroll ++ for (int r = 0; r < kNRows; ++r) { ++ A_val[r] = A[state_idx * params.A_dstate_stride + r * params.A_d_stride]; ++ // Multiply the real part of A with LOG2E so we can use exp2f instead of expf. ++ constexpr float kLog2e = M_LOG2E; ++ A_val[r] *= kLog2e; ++ } ++ // This variable holds B * C if both B and C are constant across seqlen. If only B varies ++ // across seqlen, this holds C. If only C varies across seqlen, this holds B. ++ // If both B and C vary, this is unused. ++ weight_t BC_val[kNRows]; ++ weight_t B_vals[kNItems], C_vals[kNItems]; ++ if constexpr (kIsVariableB) { ++ load_weight(Bvar + state_idx * params.B_dstate_stride, B_vals, ++ smem_load_weight, (seqlen - chunk * kChunkSize) * (1)); ++ if constexpr (!kIsVariableC) { ++ #pragma unroll ++ for (int r = 0; r < kNRows; ++r) { ++ BC_val[r] = C[state_idx * params.C_dstate_stride + r * params.C_d_stride]; ++ } ++ } ++ } ++ if constexpr (kIsVariableC) { ++ auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1; ++ load_weight(Cvar + state_idx * params.C_dstate_stride, C_vals, ++ smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1 )); ++ if constexpr (!kIsVariableB) { ++ #pragma unroll ++ for (int r = 0; r < kNRows; ++r) { ++ BC_val[r] = B[state_idx * params.B_dstate_stride + r * params.B_d_stride]; ++ } ++ } ++ } ++ if constexpr (!kIsVariableB && !kIsVariableC) { ++ #pragma unroll ++ for (int r = 0; r < kNRows; ++r) { ++ BC_val[r] = B[state_idx * params.B_dstate_stride + r * params.B_d_stride] * C[state_idx * params.C_dstate_stride + r * params.C_d_stride]; ++ } ++ } ++ ++ #pragma unroll ++ for (int r = 0; r < kNRows; ++r) { ++ if (r > 0) { __syncthreads(); } // Scan could be using the same smem ++ scan_t thread_data[kNItems]; ++ #pragma unroll ++ for (int i = 0; i < kNItems; ++i) { ++ thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]), ++ !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]); ++ ++ if (seqlen % (kNItems * kNThreads) != 0) { // So that the last state is correct ++ if (threadIdx.x * kNItems + i >= seqlen - chunk * kChunkSize) { ++ thread_data[i] = make_float2(1.f, 0.f); ++ } ++ } ++ } ++ // Initialize running total ++ ++ scan_t running_prefix = chunk > 0 ? 
smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx]): 0.0); ++ ++ SSMScanPrefixCallbackOp prefix_op(running_prefix); ++ typename Ktraits::BlockScanT(smem_scan).InclusiveScan( ++ thread_data, thread_data, SSMScanOp(), prefix_op ++ ); ++ // There's a syncthreads in the scan op, so we don't need to sync here. ++ // Unless there's only 1 warp, but then it's the same thread (0) reading and writing. ++ if (threadIdx.x == 0) { ++ smem_running_prefix[state_idx] = prefix_op.running_prefix; ++ if (chunk == n_chunks - 1) { ++ ssm_states[state_idx] = input_t(prefix_op.running_prefix.y); ++ } ++ } ++ #pragma unroll ++ for (int i = 0; i < kNItems; ++i) { ++ const weight_t C_val = !kIsVariableC ++ ? BC_val[r] ++ : (!kIsVariableB ? BC_val[r] * C_vals[i] : C_vals[i]); ++ out_vals[r][i] += thread_data[i].y * C_val; ++ } ++ } ++ } ++ ++ input_t *out = reinterpret_cast(params.out_ptr) + sequence_start_index * params.out_batch_stride ++ + dim_id * kNRows * params.out_d_stride + chunk * kChunkSize; ++ __syncthreads(); ++ #pragma unroll ++ for (int r = 0; r < kNRows; ++r) { ++ if constexpr (!kDirectIO) { ++ if (r > 0) { __syncthreads(); } ++ } ++ store_output(out + r * params.out_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize); ++ } ++ ++ if constexpr (kHasZ) { ++ input_t *z = reinterpret_cast(params.z_ptr) + sequence_start_index * params.z_batch_stride ++ + dim_id * kNRows * params.z_d_stride + chunk * kChunkSize; ++ input_t *out_z = reinterpret_cast(params.out_z_ptr) + sequence_start_index * params.out_z_batch_stride ++ + dim_id * kNRows * params.out_z_d_stride + chunk * kChunkSize; ++ #pragma unroll ++ for (int r = 0; r < kNRows; ++r) { ++ input_t z_vals[kNItems]; ++ __syncthreads(); ++ load_input(z + r * params.z_d_stride, z_vals, smem_load, seqlen - chunk * kChunkSize); ++ #pragma unroll ++ for (int i = 0; i < kNItems; ++i) { ++ float z_val = z_vals[i]; ++ out_vals[r][i] *= z_val / (1 + expf(-z_val)); ++ } ++ __syncthreads(); ++ store_output(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize); ++ } ++ } ++ ++ Bvar += kChunkSize * 1; ++ Cvar += kChunkSize * 1; ++ } ++} ++ ++template ++void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { ++ // Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block ++ // processing 1 row. 
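[Editor's note] The running prefix above starts each sequence as (1, initial state), so the scan behaves as if the recurrence had already produced that state, is handed from one chunk iteration to the next through shared memory, and its second component is written back to ssm_states after the last chunk. A CPU-only analogue of that hand-off, assuming invented values and a chunk size of 4:

// CPU analogue of the chunked scan with a carried running prefix.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

struct Step { float a, b; };  // h -> a*h + b

static Step combine(const Step& lhs, const Step& rhs) {
  return {rhs.a * lhs.a, rhs.a * lhs.b + rhs.b};
}

int main() {
  const float initial_state = 0.5f;            // plays the role of ssm_states[state_idx]
  std::vector<Step> seq(10);
  for (int t = 0; t < 10; ++t) seq[t] = {0.95f, float(t + 1)};

  // Reference: plain sequential recurrence from the initial state.
  float h_ref = initial_state;
  for (const Step& s : seq) h_ref = s.a * h_ref + s.b;

  // Chunked version: 4 steps per "chunk", prefix carried between chunks.
  Step prefix = {1.0f, initial_state};         // identity decay, seeded state
  for (std::size_t start = 0; start < seq.size(); start += 4) {
    const std::size_t stop = std::min(start + 4, seq.size());
    for (std::size_t i = start; i < stop; ++i) prefix = combine(prefix, seq[i]);
  }
  std::printf("sequential h = %.4f, chunked h = %.4f\n", h_ref, prefix.b);  // identical
  return 0;
}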
++ constexpr int kNRows = 1; ++ // kIsVariableB, kIsVariableC and kHasZ are all set to True to reduce binary size ++ constexpr bool kIsVariableB = true; ++ constexpr bool kIsVariableC = true; ++ constexpr bool kHasZ = true; ++ BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] { ++ BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] { ++ using Ktraits = Selective_Scan_fwd_kernel_traits; ++ constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t); ++ dim3 grid(params.batch, params.dim / kNRows); ++ auto kernel = &selective_scan_fwd_kernel; ++ if (kSmemSize >= 48 * 1024) { ++ C10_CUDA_CHECK(cudaFuncSetAttribute( ++ kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); ++ } ++ kernel<<>>(params); ++ C10_CUDA_KERNEL_LAUNCH_CHECK(); ++ }); ++ }); ++} ++ ++template ++void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream) { ++ ++ #ifndef USE_ROCM ++ if (params.seqlen <= 128) { ++ selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream); ++ } else if (params.seqlen <= 256) { ++ selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream); ++ } else if (params.seqlen <= 512) { ++ selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream); ++ } else if (params.seqlen <= 1024) { ++ selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream); ++ } else { ++ selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream); ++ } ++ #else ++ if (params.seqlen <= 256) { ++ selective_scan_fwd_launch<64, 4, input_t, weight_t>(params, stream); ++ } else if (params.seqlen <= 512) { ++ selective_scan_fwd_launch<64, 8, input_t, weight_t>(params, stream); ++ } else if (params.seqlen <= 1024) { ++ selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream); ++ } else { ++ selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream); ++ } ++ #endif ++} ++ ++template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); ++template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); ++template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); ++ ++#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") ++ ++#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...) 
\ ++ if (ITYPE == at::ScalarType::Half) { \ ++ using input_t = at::Half; \ ++ using weight_t = float; \ ++ __VA_ARGS__(); \ ++ } else if (ITYPE == at::ScalarType::BFloat16) { \ ++ using input_t = at::BFloat16; \ ++ using weight_t = float; \ ++ __VA_ARGS__(); \ ++ } else if (ITYPE == at::ScalarType::Float) { \ ++ using input_t = float; \ ++ using weight_t = float; \ ++ __VA_ARGS__(); \ ++ } else { \ ++ AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \ ++ } ++ ++ ++template ++void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); ++ ++void set_ssm_params_fwd(SSMParamsBase ¶ms, ++ // sizes ++ const size_t batch, ++ const size_t dim, ++ const size_t seqlen, ++ const size_t dstate, ++ const size_t n_groups, ++ const bool is_variable_B, ++ const bool is_variable_C, ++ // device pointers ++ const torch::Tensor u, ++ const torch::Tensor delta, ++ const torch::Tensor A, ++ const torch::Tensor B, ++ const torch::Tensor C, ++ const torch::Tensor out, ++ const torch::Tensor z, ++ const torch::Tensor out_z, ++ const std::optional& D, ++ const std::optional& delta_bias, ++ const torch::Tensor ssm_states, ++ bool has_z, ++ bool delta_softplus, ++ const std::optional& query_start_loc, ++ const std::optional& cache_indices, ++ const std::optional& has_initial_state, ++ bool varlen, ++ int64_t pad_slot_id) { ++ ++ // Reset the parameters ++ memset(¶ms, 0, sizeof(params)); ++ ++ params.batch = batch; ++ params.dim = dim; ++ params.seqlen = seqlen; ++ params.dstate = dstate; ++ params.n_groups = n_groups; ++ params.dim_ngroups_ratio = dim / n_groups; ++ params.pad_slot_id = pad_slot_id; ++ ++ params.delta_softplus = delta_softplus; ++ ++ params.is_variable_B = is_variable_B; ++ params.is_variable_C = is_variable_C; ++ ++ // Set the pointers and strides. ++ params.u_ptr = u.data_ptr(); ++ params.delta_ptr = delta.data_ptr(); ++ params.A_ptr = A.data_ptr(); ++ params.B_ptr = B.data_ptr(); ++ params.C_ptr = C.data_ptr(); ++ params.D_ptr = D.has_value() ? D.value().data_ptr() : nullptr; ++ params.delta_bias_ptr = delta_bias.has_value() ? delta_bias.value().data_ptr() : nullptr; ++ params.out_ptr = out.data_ptr(); ++ params.ssm_states_ptr = ssm_states.data_ptr(); ++ params.z_ptr = has_z ? z.data_ptr() : nullptr; ++ params.out_z_ptr = has_z ? out_z.data_ptr() : nullptr; ++ params.query_start_loc_ptr = query_start_loc.has_value() ? query_start_loc.value().data_ptr() : nullptr; ++ params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr; ++ params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr; ++ ++ ++ // All stride are in elements, not bytes. 
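[Editor's note] The dispatch macro above maps the runtime ATen scalar type to compile-time input_t/weight_t aliases once and then runs the templated body, with weight_t pinned to float to keep the number of instantiations down. A macro-free sketch of the same pattern; the enum, stand-in types, and functor are invented for illustration and are not the patch's API:

// Standalone sketch of runtime-dtype-to-compile-time-types dispatch.
#include <cstdio>

enum class DtypeTag { kHalf, kBFloat16, kFloat };

struct HalfStandIn { unsigned short bits; };   // placeholder for at::Half
struct BF16StandIn { unsigned short bits; };   // placeholder for at::BFloat16

template <typename Fn>
void dispatch_itype(DtypeTag tag, Fn&& fn) {
  switch (tag) {                               // weight type stays float in every branch
    case DtypeTag::kHalf:     fn.template operator()<HalfStandIn, float>(); break;
    case DtypeTag::kBFloat16: fn.template operator()<BF16StandIn, float>(); break;
    case DtypeTag::kFloat:    fn.template operator()<float, float>();       break;
  }
}

struct LaunchSketch {
  template <typename input_t, typename weight_t>
  void operator()() const {
    std::printf("input is %zu bytes, weight is %zu bytes\n", sizeof(input_t), sizeof(weight_t));
  }
};

int main() {
  dispatch_itype(DtypeTag::kBFloat16, LaunchSketch{});  // prints: input is 2 bytes, weight is 4 bytes
  return 0;
}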
++ params.A_d_stride = A.stride(0); ++ params.A_dstate_stride = A.stride(1); ++ ++ if (varlen){ ++ params.B_batch_stride = B.stride(2); ++ params.B_group_stride = B.stride(0); ++ params.B_dstate_stride = B.stride(1); ++ params.C_batch_stride = C.stride(2); ++ params.C_group_stride = C.stride(0); ++ params.C_dstate_stride = C.stride(1); ++ ++ params.u_batch_stride = u.stride(1); ++ params.u_d_stride = u.stride(0); ++ params.delta_batch_stride = delta.stride(1); ++ params.delta_d_stride = delta.stride(0); ++ if (has_z) { ++ params.z_batch_stride = z.stride(1); ++ params.z_d_stride = z.stride(0); ++ params.out_z_batch_stride = out_z.stride(1); ++ params.out_z_d_stride = out_z.stride(0); ++ } ++ params.out_batch_stride = out.stride(1); ++ params.out_d_stride = out.stride(0); ++ ++ } ++ else{ ++ if (!is_variable_B) { ++ params.B_d_stride = B.stride(0); ++ } else { ++ params.B_batch_stride = B.stride(0); ++ params.B_group_stride = B.stride(1); ++ } ++ params.B_dstate_stride = !is_variable_B ? B.stride(1) : B.stride(2); ++ if (!is_variable_C) { ++ params.C_d_stride = C.stride(0); ++ } else { ++ params.C_batch_stride = C.stride(0); ++ params.C_group_stride = C.stride(1); ++ } ++ params.C_dstate_stride = !is_variable_C ? C.stride(1) : C.stride(2); ++ params.u_batch_stride = u.stride(0); ++ params.u_d_stride = u.stride(1); ++ params.delta_batch_stride = delta.stride(0); ++ params.delta_d_stride = delta.stride(1); ++ if (has_z) { ++ params.z_batch_stride = z.stride(0); ++ params.z_d_stride = z.stride(1); ++ params.out_z_batch_stride = out_z.stride(0); ++ params.out_z_d_stride = out_z.stride(1); ++ } ++ params.out_batch_stride = out.stride(0); ++ params.out_d_stride = out.stride(1); ++ } ++} ++ ++void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, ++ const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C, ++ const std::optional &D_, ++ const std::optional &z_, ++ const std::optional &delta_bias_, ++ bool delta_softplus, ++ const std::optional &query_start_loc, ++ const std::optional &cache_indices, ++ const std::optional &has_initial_state, ++ const torch::Tensor &ssm_states, ++ // used to identify padding entries if cache_indices provided ++ // in case of padding, the kernel will return early ++ int64_t pad_slot_id) { ++ auto input_type = u.scalar_type(); ++ auto weight_type = A.scalar_type(); ++ TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); ++ TORCH_CHECK(weight_type == at::ScalarType::Float); ++ ++ const bool is_variable_B = B.dim() >= 3; ++ const bool is_variable_C = C.dim() >= 3; ++ ++ TORCH_CHECK(delta.scalar_type() == input_type); ++ TORCH_CHECK(B.scalar_type() == (!is_variable_B ? weight_type : input_type)); ++ TORCH_CHECK(C.scalar_type() == (!is_variable_C ? weight_type : input_type)); ++ ++ TORCH_CHECK(u.is_cuda()); ++ TORCH_CHECK(delta.is_cuda()); ++ TORCH_CHECK(A.is_cuda()); ++ TORCH_CHECK(B.is_cuda()); ++ TORCH_CHECK(C.is_cuda()); ++ ++ TORCH_CHECK(u.stride(-1) == 1 || u.size(-1) == 1); ++ TORCH_CHECK(delta.stride(-1) == 1 || delta.size(-1) == 1); ++ ++ const auto sizes = u.sizes(); ++ const bool varlen = query_start_loc.has_value(); ++ const int batch_size = varlen ? query_start_loc.value().sizes()[0] - 1 : sizes[0]; ++ const int dim = varlen ? sizes[0] : sizes[1]; ++ const int seqlen = varlen ? sizes[1] : sizes[2]; ++ const int dstate = A.size(1); ++ const int n_groups = varlen ? 
B.size(0) : B.size(1); ++ ++ TORCH_CHECK(dstate <= 256, "selective_scan only supports state dimension <= 256"); ++ ++ if (varlen) { ++ CHECK_SHAPE(u, dim, seqlen); ++ CHECK_SHAPE(delta, dim, seqlen); ++ } else { ++ CHECK_SHAPE(u, batch_size, dim, seqlen); ++ CHECK_SHAPE(delta, batch_size, dim, seqlen); ++ } ++ CHECK_SHAPE(A, dim, dstate); ++ TORCH_CHECK(is_variable_B, "is_variable_B = False is disabled in favor of reduced binary size") ++ if (varlen) { ++ CHECK_SHAPE(B, n_groups, dstate, seqlen); ++ } else { ++ CHECK_SHAPE(B, batch_size, n_groups, dstate, seqlen); ++ } ++ TORCH_CHECK(B.stride(-1) == 1 || B.size(-1) == 1); ++ ++ TORCH_CHECK(is_variable_C, "is_variable_C = False is disabled in favor of reduced binary size") ++ if (varlen) { ++ CHECK_SHAPE(C, n_groups, dstate, seqlen); ++ } else { ++ CHECK_SHAPE(C, batch_size, n_groups, dstate, seqlen); ++ } ++ TORCH_CHECK(C.stride(-1) == 1 || C.size(-1) == 1); ++ ++ if (D_.has_value()) { ++ auto D = D_.value(); ++ TORCH_CHECK(D.scalar_type() == at::ScalarType::Float); ++ TORCH_CHECK(D.is_cuda()); ++ TORCH_CHECK(D.stride(-1) == 1 || D.size(-1) == 1); ++ CHECK_SHAPE(D, dim); ++ } ++ ++ if (delta_bias_.has_value()) { ++ auto delta_bias = delta_bias_.value(); ++ TORCH_CHECK(delta_bias.scalar_type() == at::ScalarType::Float); ++ TORCH_CHECK(delta_bias.is_cuda()); ++ TORCH_CHECK(delta_bias.stride(-1) == 1 || delta_bias.size(-1) == 1); ++ CHECK_SHAPE(delta_bias, dim); ++ } ++ ++ ++ if (has_initial_state.has_value()) { ++ auto has_initial_state_ = has_initial_state.value(); ++ TORCH_CHECK(has_initial_state_.scalar_type() == at::ScalarType::Bool); ++ TORCH_CHECK(has_initial_state_.is_cuda()); ++ CHECK_SHAPE(has_initial_state_, batch_size); ++ } ++ ++ ++ if (query_start_loc.has_value()) { ++ auto query_start_loc_ = query_start_loc.value(); ++ TORCH_CHECK(query_start_loc_.scalar_type() == at::ScalarType::Int); ++ TORCH_CHECK(query_start_loc_.is_cuda()); ++ } ++ ++ ++ if (cache_indices.has_value()) { ++ auto cache_indices_ = cache_indices.value(); ++ TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int); ++ TORCH_CHECK(cache_indices_.is_cuda()); ++ CHECK_SHAPE(cache_indices_, batch_size); ++ } ++ ++ ++ at::Tensor z, out_z; ++ const bool has_z = z_.has_value(); ++ TORCH_CHECK(has_z, "has_z = False is disabled in favor of reduced binary size") ++ z = z_.value(); ++ TORCH_CHECK(z.scalar_type() == input_type); ++ TORCH_CHECK(z.is_cuda()); ++ TORCH_CHECK(z.stride(-1) == 1 || z.size(-1) == 1); ++ if (varlen){ ++ CHECK_SHAPE(z, dim, seqlen); ++ } else { ++ CHECK_SHAPE(z, batch_size, dim, seqlen); ++ } ++ ++ out_z = z; ++ ++ // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout ++ at::Tensor out = delta; ++ TORCH_CHECK(ssm_states.scalar_type() == input_type); ++ TORCH_CHECK(ssm_states.is_cuda()); ++ TORCH_CHECK(ssm_states.stride(-1) == 1); ++ ++ SSMParamsBase params; ++ set_ssm_params_fwd(params, batch_size, dim, seqlen, dstate, n_groups, is_variable_B, is_variable_C, ++ u, delta, A, B, C, out, z, out_z, ++ D_, ++ delta_bias_, ++ ssm_states, ++ has_z, ++ delta_softplus, ++ query_start_loc, ++ cache_indices, ++ has_initial_state, ++ varlen, ++ pad_slot_id ++ ); ++ ++ ++ // Otherwise the kernel will be launched from cuda:0 device ++ // Cast to char to avoid compiler warning about narrowing ++ at::cuda::CUDAGuard device_guard{(char)u.get_device()}; ++ auto stream = at::cuda::getCurrentCUDAStream().stream(); ++ DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] { ++ 
selective_scan_fwd_cuda(params, stream); ++ }); ++} ++ +diff --git a/csrc/mamba/mamba_ssm/static_switch.h b/csrc/mamba/mamba_ssm/static_switch.h +new file mode 100644 +index 0000000..840cb23 +--- /dev/null ++++ b/csrc/mamba/mamba_ssm/static_switch.h +@@ -0,0 +1,28 @@ ++// Inspired by ++// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h ++// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h ++ ++// clang-format off ++// adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/static_switch.h ++#pragma once ++ ++/// @param COND - a boolean expression to switch by ++/// @param CONST_NAME - a name given for the constexpr bool variable. ++/// @param ... - code to execute for true and false ++/// ++/// Usage: ++/// ``` ++/// BOOL_SWITCH(flag, BoolConst, [&] { ++/// some_function(...); ++/// }); ++/// ``` ++#define BOOL_SWITCH(COND, CONST_NAME, ...) \ ++ [&] { \ ++ if (COND) { \ ++ constexpr bool CONST_NAME = true; \ ++ return __VA_ARGS__(); \ ++ } else { \ ++ constexpr bool CONST_NAME = false; \ ++ return __VA_ARGS__(); \ ++ } \ ++ }() +diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel.h b/csrc/moe/marlin_kernels/marlin_moe_kernel.h +new file mode 100644 +index 0000000..a217401 +--- /dev/null ++++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h +@@ -0,0 +1,1616 @@ ++#pragma once ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "core/scalar_type.hpp" ++ ++namespace marlin_moe { ++ ++constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } ++ ++#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 ++ ++// Instances of `Vec` are used to organize groups of >>registers<<, as needed ++// for instance as inputs to tensor core operations. Consequently, all ++// corresponding index accesses must be compile-time constants, which is why we ++// extensively use `#pragma unroll` throughout the kernel code to guarantee ++// this. ++template ++struct Vec { ++ T elems[n]; ++ __device__ T& operator[](int i) { return elems[i]; } ++}; ++ ++using I4 = Vec; ++ ++// Matrix fragments for tensor core instructions; their precise layout is ++// documented here: ++// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type ++using FragA = Vec; ++using FragB = Vec; ++using FragC = Vec; ++using FragS = Vec; // quantization scales ++using FragZP = Vec; ++ ++// Predicated asynchronous global->shared copy; used for inputs A where we apply ++// predication to handle batchsizes that are not multiples of 16. ++__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, ++ bool pred = true) { ++ const int BYTES = 16; ++ uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); ++ asm volatile( ++ "{\n" ++ " .reg .pred p;\n" ++ " setp.ne.b32 p, %0, 0;\n" ++ " @p cp.async.cg.shared.global [%1], [%2], %3;\n" ++ "}\n" ::"r"((int)pred), ++ "r"(smem), "l"(glob_ptr), "n"(BYTES)); ++} ++ ++// Asynchronous global->shared copy ++__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { ++ const int BYTES = 16; ++ uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); ++ asm volatile( ++ "{\n" ++ " cp.async.cg.shared.global [%0], [%1], %2;\n" ++ "}\n" ::"r"(smem), ++ "l"(glob_ptr), "n"(BYTES)); ++} ++ ++// Async copy fence. ++__device__ inline void cp_async_fence() { ++ asm volatile("cp.async.commit_group;\n" ::); ++} ++ ++// Wait until at most `n` async copy stages are still pending. 
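++// Usage sketch (illustrative only; it mirrors the pipelined main loop further
++// below rather than adding new behaviour):
++//   for (int s = 0; s < stages - 1; s++) {
++//     /* enqueue cp_async4 / cp_async4_pred copies for stage s */
++//     cp_async_fence();          // close the copy group for this stage
++//   }
++//   cp_async_wait<stages - 2>(); // block until at most stages-2 groups are in flight
++//   __syncthreads();             // make the landed tiles visible to all warps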
++template ++__device__ inline void cp_async_wait() { ++ asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); ++} ++ ++// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 ++// output/accumulation. ++__device__ inline void mma(const FragA& a_frag, const FragB& frag_b, ++ FragC& frag_c) { ++ const uint32_t* a = reinterpret_cast(&a_frag); ++ const uint32_t* b = reinterpret_cast(&frag_b); ++ float* c = reinterpret_cast(&frag_c); ++ asm volatile( ++ "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " ++ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" ++ : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) ++ : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), ++ "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); ++} ++ ++// Instruction for loading a full 16x16 matrix fragment of operand A from shared ++// memory, directly in tensor core layout. ++__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) { ++ uint32_t* a = reinterpret_cast(&frag_a); ++ uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); ++ asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" ++ : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) ++ : "r"(smem)); ++} ++ ++// Lookup-table based 3-input logical operation; explicitly used for ++// dequantization as the compiler does not seem to automatically recognize it in ++// all cases. ++template ++__device__ inline int lop3(int a, int b, int c) { ++ int res; ++ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" ++ : "=r"(res) ++ : "r"(a), "r"(b), "r"(c), "n"(lut)); ++ return res; ++} ++ ++// Constructs destination register by taking bytes from 2 sources (based on ++// mask) ++template ++__device__ inline uint32_t prmt(uint32_t a) { ++ uint32_t res; ++ asm volatile("prmt.b32 %0, %1, %2, %3;\n" ++ : "=r"(res) ++ : "r"(a), "n"(start_byte), "n"(mask)); ++ return res; ++} ++ ++template ++__device__ inline FragB dequant(int q); ++ ++// Efficiently dequantize 4bit values packed in an int32 value into a full ++// B-fragment of 4 fp16 values. We mostly follow the strategy in the link below, ++// with some small changes: ++// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287 ++template <> ++__device__ inline FragB dequant(int q) { ++ const int LO = 0x000f000f; ++ const int HI = 0x00f000f0; ++ const int EX = 0x64006400; ++ // Guarantee that the `(a & b) | c` operations are LOP3s. ++ int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); ++ int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); ++ // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point ++ // directly into `SUB` and `ADD`. 
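++  // Worked example of the constants below (added for clarity; values follow
++  // from the IEEE fp16 encoding): EX = 0x6400 is 1024.0, so OR-ing the low
++  // nibble q_lo into the mantissa yields the half value 1024 + q_lo, and
++  // subtracting SUB = 0x6408 (= 1032.0) leaves q_lo - 8. The high nibble sits
++  // at bit 4, so the same trick yields 1024 + 16 * q_hi; the fused
++  // multiply-add with MUL = 0x2c00 (= 1/16) and ADD = 0xd480 (= -72.0)
++  // evaluates (1024 + 16 * q_hi) / 16 - 72 = q_hi - 8.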
++ const int SUB = 0x64086408; ++ const int MUL = 0x2c002c00; ++ const int ADD = 0xd480d480; ++ FragB frag_b; ++ frag_b[0] = __hsub2(*reinterpret_cast(&lo), ++ *reinterpret_cast(&SUB)); ++ frag_b[1] = __hfma2(*reinterpret_cast(&hi), ++ *reinterpret_cast(&MUL), ++ *reinterpret_cast(&ADD)); ++ return frag_b; ++} ++ ++// Fast Int8ToFp16: Efficiently dequantize 8bit int values to fp16 ++// Reference: ++// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85 ++template <> ++__device__ inline FragB dequant(int q) { ++ static constexpr uint32_t mask_for_elt_01 = 0x5250; ++ static constexpr uint32_t mask_for_elt_23 = 0x5351; ++ static constexpr uint32_t start_byte_for_fp16 = 0x64646464; ++ ++ uint32_t lo = prmt(q); ++ uint32_t hi = prmt(q); ++ ++ static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; ++ ++ FragB frag_b; ++ frag_b[0] = __hsub2(*reinterpret_cast(&lo), ++ *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); ++ frag_b[1] = __hsub2(*reinterpret_cast(&hi), ++ *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); ++ return frag_b; ++} ++ ++template <> ++__device__ inline FragB dequant(int q) { ++ const int LO = 0x000f000f; ++ const int HI = 0x00f000f0; ++ const int EX = 0x64006400; ++ // Guarantee that the `(a & b) | c` operations are LOP3s. ++ int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); ++ int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); ++ ++ const int SUB = 0x64006400; ++ const int MUL = 0x2c002c00; ++ const int ADD = 0xd400d400; ++ FragB frag_b; ++ frag_b[0] = __hsub2(*reinterpret_cast(&lo), ++ *reinterpret_cast(&SUB)); ++ frag_b[1] = __hfma2(*reinterpret_cast(&hi), ++ *reinterpret_cast(&MUL), ++ *reinterpret_cast(&ADD)); ++ return frag_b; ++} ++ ++template <> ++__device__ inline FragB dequant(int q) { ++ static constexpr uint32_t mask_for_elt_01 = 0x5250; ++ static constexpr uint32_t mask_for_elt_23 = 0x5351; ++ static constexpr uint32_t start_byte_for_fp16 = 0x64646464; ++ ++ uint32_t lo = prmt(q); ++ uint32_t hi = prmt(q); ++ ++ static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64006400; ++ ++ FragB frag_b; ++ frag_b[0] = __hsub2(*reinterpret_cast(&lo), ++ *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); ++ frag_b[1] = __hsub2(*reinterpret_cast(&hi), ++ *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); ++ return frag_b; ++} ++ ++// Multiply dequantized values by the corresponding quantization scale; used ++// only for grouped quantization. 
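++// In formula form (illustrative): for an element of quantization group g in
++// output column n, the reconstructed weight is roughly
++//   w = s[g][n] * (dequant(q) - zp[g][n])
++// where the explicit zero-point term only exists on the has_zp (AWQ) path;
++// scale() below applies the per-group scale to a pair of fp16 values at once.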
++__device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) { ++ half2 s = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]); ++ frag_b[0] = __hmul2(frag_b[0], s); ++ frag_b[1] = __hmul2(frag_b[1], s); ++} ++ ++__device__ inline void sub_zp(FragB& frag_b, half2& frag_zp, int i) { ++ half2 zp = __half2half2(reinterpret_cast<__half*>(&frag_zp)[i]); ++ frag_b[0] = __hsub2(frag_b[0], zp); ++ frag_b[1] = __hsub2(frag_b[1], zp); ++} ++ ++// Same as above, but for act_order (each K is multiplied individually) ++__device__ inline void scale4(FragB& frag_b, FragS& frag_s_1, FragS& frag_s_2, ++ FragS& frag_s_3, FragS& frag_s_4, int i) { ++ __half2 s_val_1_2; ++ s_val_1_2.x = reinterpret_cast<__half*>(&frag_s_1)[i]; ++ s_val_1_2.y = reinterpret_cast<__half*>(&frag_s_2)[i]; ++ ++ __half2 s_val_3_4; ++ s_val_3_4.x = reinterpret_cast<__half*>(&frag_s_3)[i]; ++ s_val_3_4.y = reinterpret_cast<__half*>(&frag_s_4)[i]; ++ ++ frag_b[0] = __hmul2(frag_b[0], s_val_1_2); ++ frag_b[1] = __hmul2(frag_b[1], s_val_3_4); ++} ++ ++// Given 2 floats multiply by 2 scales (halves) ++__device__ inline void scale_float(float* c, FragS& s) { ++ __half* s_ptr = reinterpret_cast<__half*>(&s); ++ c[0] = __fmul_rn(c[0], __half2float(s_ptr[0])); ++ c[1] = __fmul_rn(c[1], __half2float(s_ptr[1])); ++} ++ ++// Wait until barrier reaches `count`, then lock for current threadblock. ++__device__ inline void barrier_acquire(int* lock, int count) { ++ if (threadIdx.x == 0) { ++ int state = -1; ++ do ++ // Guarantee that subsequent writes by this threadblock will be visible ++ // globally. ++ asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" ++ : "=r"(state) ++ : "l"(lock)); ++ while (state != count); ++ } ++ __syncthreads(); ++} ++ ++// Release barrier and increment visitation count. ++__device__ inline void barrier_release(int* lock, bool reset = false) { ++ __syncthreads(); ++ if (threadIdx.x == 0) { ++ if (reset) { ++ lock[0] = 0; ++ return; ++ } ++ int val = 1; ++ // Make sure that all writes since acquiring this barrier are visible ++ // globally, while releasing the barrier. 
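++    // Protocol sketch (illustrative): threadblocks that share a column slice
++    // are serialized through `lock`. The block with slice_idx == i spins in
++    // barrier_acquire(lock, i) until the counter reaches i, accumulates its
++    // partial results into C, and then calls barrier_release(lock); the fence
++    // and atomic add below publish those writes and let slice_idx == i + 1 run.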
++ asm volatile("fence.acq_rel.gpu;\n"); ++ asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" ++ : ++ : "l"(lock), "r"(val)); ++ } ++} ++ ++template shared ++ // fetch pipeline ++ const bool has_act_order, // whether act_order is enabled ++ const bool has_zp, // whether zero-points are enabled ++ const int group_blocks = -1 // number of consecutive 16x16 blocks ++ // with a separate quantization scale ++ > ++__device__ void MarlinMoESingle( ++ const int4* __restrict__ A, // fp16 input matrix of shape mxk ++ const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn ++ int4* __restrict__ C, // fp16 output buffer of shape mxn ++ const int* __restrict__ sorted_ids, // int32 sorted ids of experts ++ const float* __restrict__ topk_weights, // float topk weights ++ const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape ++ // (k/groupsize)xn ++ const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape ++ // (k/groupsize)x(n/pack_factor) ++ const int* __restrict__ g_idx, // int32 group indices of shape k ++ const int* __restrict__ expert_offsets, ++ int num_groups, // number of scale groups per output channel ++ int expert_idx, // idx of current expert ++ int num_experts, // number of experts ++ int topk, // topk parameter of moe ++ int prob_m, // batch dimension m ++ int prob_n, // output dimension n ++ int prob_k, // reduction dimension k ++ int tot_m, // total number of rows in A and C ++ int* locks, // extra global storage for barrier synchronization ++ bool replicate_input, // do we use the same input for each expert? ++ bool apply_weights, // apply weights to output ++ int current_m_block // current m block to start kernel computation from ++) { ++ static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id); ++ constexpr int pack_factor = 32 / w_type.size_bits(); ++ ++ // For larger GEMMs we run multiple batchsize 64 versions in parallel for a ++ // better partitioning with less reductions ++ int parallel = 1; ++ if (prob_m > 16 * thread_m_blocks) { ++ parallel = prob_m / (16 * thread_m_blocks); ++ prob_m = 16 * thread_m_blocks; ++ } ++ ++ int k_tiles = prob_k / 16 / thread_k_blocks; ++ int n_tiles = prob_n / 16 / thread_n_blocks; ++ int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); ++ ++ if constexpr (!has_act_order && group_blocks != -1) { ++ if (group_blocks >= thread_k_blocks) { ++ // Ensure that the number of tiles in each stripe is a multiple of the ++ // groupsize; this avoids an annoying special case where a stripe starts ++ // in the middle of group. ++ iters = (group_blocks / thread_k_blocks) * ++ ceildiv(iters, (group_blocks / thread_k_blocks)); ++ } ++ } ++ ++ int slice_row = (iters * blockIdx.x) % k_tiles; ++ int slice_col_par = (iters * blockIdx.x) / k_tiles; ++ int slice_col = slice_col_par; ++ int slice_iters; // number of threadblock tiles in the current slice ++ int slice_count = ++ 0; // total number of active threadblocks in the current slice ++ int slice_idx; // index of threadblock in current slice; numbered bottom to ++ // top ++ ++ // We can easily implement parallel problem execution by just remapping ++ // indices and advancing global pointers ++ if (slice_col_par >= n_tiles) { ++ locks += (slice_col_par / n_tiles) * n_tiles; ++ slice_col = slice_col_par % n_tiles; ++ sorted_ids += (slice_col_par / n_tiles) * 16 * thread_m_blocks; ++ } ++ ++ // Compute all information about the current slice which is required for ++ // synchronization. 
++ auto init_slice = [&]() { ++ slice_iters = ++ iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); ++ if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; ++ if (slice_iters == 0) return; ++ if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; ++ slice_count = 1; ++ slice_idx = 0; ++ int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); ++ if (col_first <= k_tiles * (slice_col_par + 1)) { ++ int col_off = col_first - k_tiles * slice_col_par; ++ slice_count = ceildiv(k_tiles - col_off, iters); ++ if (col_off > 0) slice_count++; ++ int delta_first = iters * blockIdx.x - col_first; ++ if (delta_first < 0 || (col_off == 0 && delta_first == 0)) ++ slice_idx = slice_count - 1; ++ else { ++ slice_idx = slice_count - 1 - delta_first / iters; ++ if (col_off > 0) slice_idx--; ++ } ++ } ++ if (slice_col == n_tiles) { ++ sorted_ids += 16 * thread_m_blocks; ++ locks += n_tiles; ++ slice_col = 0; ++ } ++ }; ++ init_slice(); ++ ++ // A sizes/strides ++ ++ // stride of the A matrix in global memory ++ int a_gl_stride = prob_k / 8; ++ // stride of an A matrix tile in shared memory ++ constexpr int a_sh_stride = 16 * thread_k_blocks / 8; ++ // delta between subsequent A tiles in global memory ++ constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8; ++ // between subsequent accesses within a tile ++ int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o); ++ // between shared memory writes ++ constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o); ++ // between shared memory tile reads ++ constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4)); ++ // within a shared memory tile ++ constexpr int a_sh_rd_delta_i = a_sh_stride * 16; ++ // overall size of a tile ++ constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks); ++ // number of shared write iterations for a tile ++ constexpr int a_sh_wr_iters = ceildiv(a_sh_stage, a_sh_wr_delta); ++ ++ // B sizes/strides ++ int b_gl_stride = 16 * prob_n / (pack_factor * 4); ++ constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4; ++ constexpr int b_thread_vecs = w_type.size_bits() == 4 ? 1 : 2; ++ constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs; ++ ++ int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; ++ int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads); ++ constexpr int b_sh_wr_delta = threads * b_thread_vecs; ++ constexpr int b_sh_rd_delta = threads * b_thread_vecs; ++ constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; ++ constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; ++ ++ // Scale sizes/strides without act_order ++ int s_gl_stride = prob_n / 8; ++ constexpr int s_sh_stride = 16 * thread_n_blocks / 8; ++ constexpr int s_tb_groups = ++ !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks ++ ? thread_k_blocks / group_blocks ++ : 1; ++ constexpr int s_sh_stage = s_tb_groups * s_sh_stride; ++ int s_gl_rd_delta = s_gl_stride; ++ // Scale size/strides with act_order ++ constexpr int tb_k = 16 * thread_k_blocks; ++ constexpr int g_idx_stage = has_act_order ? 
(tb_k * sizeof(int)) / 16 : 0; ++ // constexpr int act_s_row_stride = 1; ++ // int act_s_col_stride = act_s_row_stride * num_groups; ++ int act_s_col_stride = 1; ++ int act_s_col_warp_stride = act_s_col_stride * 8; ++ int tb_n_warps = thread_n_blocks / 4; ++ int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps; ++ ++ // Zero-points sizes/strides ++ int zp_gl_stride = (prob_n / pack_factor) / 4; ++ constexpr int zp_sh_stride = ((16 * thread_n_blocks) / pack_factor) / 4; ++ constexpr int zp_tb_groups = s_tb_groups; ++ constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0; ++ int zp_gl_rd_delta = zp_gl_stride; ++ ++ // Global A read index of current thread. ++ int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + ++ (threadIdx.x % a_gl_rd_delta_o); ++ a_gl_rd += a_gl_rd_delta_o * slice_row; ++ // Shared write index of current thread. ++ int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + ++ (threadIdx.x % a_gl_rd_delta_o); ++ // Shared read index. ++ int a_sh_rd = ++ a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; ++ a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); ++ ++ int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) + ++ (threadIdx.x % b_sh_stride_threads) * b_thread_vecs; ++ b_gl_rd += b_sh_stride * slice_col; ++ b_gl_rd += b_gl_rd_delta_o * slice_row; ++ int b_sh_wr = threadIdx.x * b_thread_vecs; ++ int b_sh_rd = threadIdx.x * b_thread_vecs; ++ ++ // For act_order ++ constexpr int k_iter_size = tb_k / b_sh_wr_iters; ++ int slice_k_start = tb_k * slice_row; ++ int slice_k_finish = slice_k_start + tb_k * slice_iters; ++ int slice_k_start_shared_fetch = slice_k_start; ++ int slice_n_offset = act_s_col_tb_stride * slice_col; ++ ++ // No act_order ++ int s_gl_rd; ++ if constexpr (!has_act_order) { ++ if constexpr (group_blocks == -1) { ++ s_gl_rd = s_sh_stride * slice_col + threadIdx.x; ++ } else { ++ s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + ++ s_sh_stride * slice_col + threadIdx.x; ++ } ++ } ++ int s_sh_wr = threadIdx.x; ++ bool s_sh_wr_pred = threadIdx.x < s_sh_stride; ++ ++ // Zero-points ++ int zp_gl_rd; ++ if constexpr (has_zp) { ++ if constexpr (group_blocks == -1) { ++ zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x; ++ } else { ++ zp_gl_rd = zp_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + ++ zp_sh_stride * slice_col + threadIdx.x; ++ } ++ } ++ int zp_sh_wr = threadIdx.x; ++ bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride; ++ ++ // We use a different scale layout for grouped and column-wise quantization as ++ // we scale a `half2` tile in column-major layout in the former and in ++ // row-major in the latter case. 
++ int s_sh_rd; ++ if constexpr (group_blocks != -1) ++ s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + ++ (threadIdx.x % 32) / 4; ++ else ++ s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + ++ (threadIdx.x % 32) % 4; ++ ++ // Zero-points have the same read layout as the scales ++ // (without column-wise case) ++ constexpr int num_col_threads = 8; ++ constexpr int num_row_threads = 4; ++ constexpr int num_ints_per_thread = 8 / pack_factor; ++ int zp_sh_rd; ++ if constexpr (has_zp) { ++ zp_sh_rd = num_ints_per_thread * num_col_threads * ++ ((threadIdx.x / 32) % (thread_n_blocks / 4)) + ++ num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads); ++ } ++ ++ int sh_first_group_id = -1; ++ int sh_num_groups = -1; ++ constexpr int sh_max_num_groups = 32; ++ ++ extern __shared__ int4 sh[]; ++ // Shared memory storage for global fetch pipelines. ++ int4* sh_a = sh; ++ int4* sh_b = sh_a + (stages * a_sh_stage); ++ int4* sh_g_idx = sh_b + (stages * b_sh_stage); ++ int4* sh_zp = sh_g_idx + (stages * g_idx_stage); ++ int4* sh_s = sh_zp + (stages * zp_sh_stage); ++ ++ // Precompute which thread should not read memory in which iterations; this is ++ // needed if there are more threads than required for a certain tilesize or ++ // when the batchsize is not a multiple of 16. ++ bool a_sh_wr_pred[a_sh_wr_iters]; ++ #pragma unroll ++ for (int i = 0; i < a_sh_wr_iters; i++) { ++ int a_idx = a_sh_wr_delta * i + a_sh_wr; ++ int row = a_idx / a_gl_rd_delta_o; ++ if (row >= prob_m) { ++ a_sh_wr_pred[i] = false; ++ } else { ++ a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; ++ } ++ } ++ ++ // To ensure that writing and reading A tiles to/from shared memory, the ++ // latter in fragment format, is fully bank conflict free, we need to use a ++ // rather fancy XOR-based layout. The key here is that neither reads nor ++ // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the ++ // same shared memory banks. Further, it seems (based on NSight-Compute) that ++ // each warp must also write a consecutive memory segment? ++ auto transform_a = [&](int i) { ++ int row = i / a_gl_rd_delta_o; ++ return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; ++ }; ++ // Since the computation of this remapping is non-trivial and, due to our main ++ // loop unrolls, all shared memory accesses are static, we simply precompute ++ // both transformed reads and writes. ++ int a_sh_wr_trans[a_sh_wr_iters]; ++ #pragma unroll ++ for (int i = 0; i < a_sh_wr_iters; i++) ++ a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); ++ int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; ++ #pragma unroll ++ for (int i = 0; i < b_sh_wr_iters; i++) { ++ #pragma unroll ++ for (int j = 0; j < thread_m_blocks; j++) ++ a_sh_rd_trans[i][j] = ++ transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); ++ } ++ ++ // Since B-accesses have non-constant stride they have to be computed at ++ // runtime; we break dependencies between subsequent accesses with a tile by ++ // maintining multiple pointers (we have enough registers), a tiny ++ // optimization. ++ const int4* B_ptr[b_sh_wr_iters]; ++ #pragma unroll ++ for (int i = 0; i < b_sh_wr_iters; i++) ++ B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; ++ ++ // Register storage for double buffer of shared memory reads. 
++ FragA frag_a[2][thread_m_blocks]; ++ I4 frag_b_quant[2][b_thread_vecs]; ++ FragC frag_c[thread_m_blocks][4][2]; ++ FragS frag_s[2][4]; // No act-order ++ FragS act_frag_s[2][4][4]; // For act-order ++ int frag_qzp[2][num_ints_per_thread]; // Zero-points ++ FragZP frag_zp; // Zero-points in fp16 ++ ++ // Zero accumulators. ++ auto zero_accums = [&]() { ++ #pragma unroll ++ for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) ++ reinterpret_cast(frag_c)[i] = 0; ++ }; ++ ++ auto fetch_scales_to_shared = [&](bool is_async, int first_group_id, ++ int last_group_id) { ++ sh_first_group_id = first_group_id; ++ sh_num_groups = last_group_id - first_group_id + 1; ++ ++ if (sh_num_groups < sh_max_num_groups) { ++ sh_num_groups = sh_max_num_groups; ++ } ++ ++ if (sh_first_group_id + sh_num_groups > num_groups) { ++ sh_num_groups = num_groups - sh_first_group_id; ++ } ++ ++ int row_offset = first_group_id * s_gl_stride; ++ ++ if (is_async) { ++ for (int i = 0; i < sh_num_groups; i++) { ++ if (threadIdx.x < s_sh_stride) { ++ cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x], ++ &scales_ptr[row_offset + (i * s_gl_stride) + ++ slice_n_offset + threadIdx.x]); ++ } ++ } ++ } else { ++ for (int i = 0; i < sh_num_groups; i++) { ++ if (threadIdx.x < s_sh_stride) { ++ sh_s[(i * s_sh_stride) + threadIdx.x] = ++ scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + ++ threadIdx.x]; ++ } ++ } ++ } ++ }; ++ // Asynchronously fetch the next A, B and s tile from global to the next ++ // shared memory pipeline location. ++ auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { ++ if (pred) { ++ int4* sh_a_stage = sh_a + a_sh_stage * pipe; ++ #pragma unroll ++ for (int i = 0; i < a_sh_wr_iters; i++) { ++ int a_idx = a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off; ++ int row = a_idx / a_gl_stride; ++ int sorted_row = ++ replicate_input ? sorted_ids[row] / topk : sorted_ids[row]; ++ int new_idx = sorted_row * a_gl_stride + a_idx % a_gl_stride; ++ if (sorted_row < tot_m * (replicate_input ? 1 : topk) && ++ new_idx < a_gl_stride * tot_m * (replicate_input ? 
1 : topk)) { ++ cp_async4_pred(&sh_a_stage[a_sh_wr_trans[i]], &A[new_idx], ++ a_sh_wr_pred[i]); ++ } ++ } ++ int4* sh_b_stage = sh_b + b_sh_stage * pipe; ++ #pragma unroll ++ for (int i = 0; i < b_sh_wr_iters; i++) { ++ #pragma unroll ++ for (int j = 0; j < b_thread_vecs; j++) { ++ cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j); ++ } ++ B_ptr[i] += b_gl_rd_delta_o; ++ } ++ ++ if constexpr (has_act_order) { ++ // Fetch g_idx thread-block portion ++ int full_pipe = a_off; ++ int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe; ++ if (cur_k < prob_k && cur_k < slice_k_finish) { ++ int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; ++ ++ int4 const* cur_g_idx_stage_ptr = ++ reinterpret_cast(&g_idx[cur_k]); ++ ++ if (threadIdx.x < g_idx_stage) { ++ cp_async4_pred(&sh_g_idx_stage[threadIdx.x], ++ &cur_g_idx_stage_ptr[threadIdx.x]); ++ } ++ } ++ } else { ++ if constexpr (group_blocks != -1) { ++ int4* sh_s_stage = sh_s + s_sh_stage * pipe; ++ ++ if constexpr (group_blocks >= thread_k_blocks) { ++ // Only fetch scales if this tile starts a new group ++ if (pipe % (group_blocks / thread_k_blocks) == 0) { ++ if (s_sh_wr_pred) { ++ cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); ++ } ++ s_gl_rd += s_gl_rd_delta; ++ } ++ } else { ++ for (int i = 0; i < s_tb_groups; i++) { ++ if (s_sh_wr_pred) { ++ cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr], ++ &scales_ptr[s_gl_rd]); ++ } ++ s_gl_rd += s_gl_rd_delta; ++ } ++ } ++ } ++ ++ if constexpr (has_zp && group_blocks != -1) { ++ int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; ++ ++ if constexpr (group_blocks >= thread_k_blocks) { ++ // Only fetch zero-points if this tile starts a new group ++ if (pipe % (group_blocks / thread_k_blocks) == 0) { ++ if (zp_sh_wr_pred) { ++ cp_async4(&sh_zp_stage[zp_sh_wr], &zp_ptr[zp_gl_rd]); ++ } ++ zp_gl_rd += zp_gl_rd_delta; ++ } ++ } else { ++ for (int i = 0; i < zp_tb_groups; i++) { ++ if (zp_sh_wr_pred) { ++ cp_async4(&sh_zp_stage[i * zp_sh_stride + zp_sh_wr], ++ &zp_ptr[zp_gl_rd]); ++ } ++ zp_gl_rd += zp_gl_rd_delta; ++ } ++ } ++ } ++ } ++ } ++ // Insert a fence even when we are winding down the pipeline to ensure that ++ // waiting is also correct at this point. ++ cp_async_fence(); ++ }; ++ ++ auto fetch_zp_to_shared = [&]() { ++ if (zp_sh_wr_pred) { ++ cp_async4(&sh_zp[zp_sh_wr], &zp_ptr[zp_gl_rd]); ++ } ++ }; ++ ++ // Wait until the next thread tile has been loaded to shared memory. ++ auto wait_for_stage = [&]() { ++ // We only have `stages - 2` active fetches since we are double buffering ++ // and can only issue the next fetch when it is guaranteed that the previous ++ // shared memory load is fully complete (as it may otherwise be ++ // overwritten). ++ cp_async_wait(); ++ __syncthreads(); ++ }; ++ ++ // Load the next sub-tile from the current location in the shared memory pipe ++ // into the current register buffer. 
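++  // Note (added for clarity): the leading [2] dimension of frag_a /
++  // frag_b_quant together with the `k % 2` index below implements the
++  // register-level double buffer: while sub-tile k is multiplied out of
++  // buffer k % 2, the loads for sub-tile k + 1 land in buffer (k + 1) % 2.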
++ auto fetch_to_registers = [&](int k, int pipe) { ++ int4* sh_a_stage = sh_a + a_sh_stage * pipe; ++ #pragma unroll ++ for (int i = 0; i < thread_m_blocks; i++) ++ ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); ++ int4* sh_b_stage = sh_b + b_sh_stage * pipe; ++ ++ #pragma unroll ++ for (int i = 0; i < b_thread_vecs; i++) { ++ frag_b_quant[k % 2][i] = *reinterpret_cast( ++ &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]); ++ } ++ }; ++ ++ bool is_same_group[stages]; ++ int same_group_id[stages]; ++ ++ auto init_same_group = [&](int pipe) { ++ if constexpr (!has_act_order) { ++ is_same_group[pipe] = false; ++ same_group_id[pipe] = 0; ++ return; ++ } ++ ++ int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; ++ int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); ++ ++ int group_id_1 = sh_g_idx_int_ptr[0]; ++ int group_id_2 = sh_g_idx_int_ptr[tb_k - 1]; ++ ++ is_same_group[pipe] = group_id_1 == group_id_2; ++ same_group_id[pipe] = group_id_1; ++ }; ++ ++ auto fetch_scales_to_registers = [&](int k, int full_pipe) { ++ int pipe = full_pipe % stages; ++ ++ if constexpr (!has_act_order) { ++ // No act-order case ++ if constexpr (group_blocks != -1) { ++ if constexpr (group_blocks >= thread_k_blocks) { ++ int4* sh_s_stage = ++ sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * ++ (pipe / (group_blocks / thread_k_blocks))); ++ reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; ++ } else { ++ int warp_id = threadIdx.x / 32; ++ int n_warps = thread_n_blocks / 4; ++ ++ int warp_row = warp_id / n_warps; ++ ++ int cur_k = warp_row * 16; ++ cur_k += k_iter_size * (k % b_sh_wr_iters); ++ ++ int k_blocks = cur_k / 16; ++ int cur_group_id = k_blocks / group_blocks; ++ ++ int4* sh_s_stage = sh_s + s_sh_stage * pipe; ++ ++ reinterpret_cast(&frag_s[k % 2])[0] = ++ sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride]; ++ } ++ } ++ ++ return; ++ } ++ ++ // Act-order case ++ ++ // Determine K of the "current" thread-block ++ int cur_k = slice_k_start + tb_k * full_pipe; ++ if (cur_k >= prob_k || cur_k >= slice_k_finish) { ++ return; ++ } ++ ++ // Reset (to current thread-block) since we read g_idx portion from the ++ // shared memory ++ cur_k = 0; ++ ++ // Progress to current iteration ++ cur_k += k_iter_size * (k % b_sh_wr_iters); ++ ++ // Determine "position" inside the thread-block (based on warp and ++ // thread-id) ++ int warp_id = threadIdx.x / 32; ++ int n_warps = ++ thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N ++ ++ int warp_row = warp_id / n_warps; ++ int warp_col = warp_id % n_warps; ++ ++ cur_k += warp_row * 16; ++ ++ int th_id = threadIdx.x % 32; ++ cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix ++ ++ int s_col_shift = ++ /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) + ++ (th_id / 4) * act_s_col_stride; ++ ++ if (is_same_group[pipe]) { ++ if (k % 2 == 0) { ++ *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = ++ sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride + ++ s_col_shift]; ++ } else { ++ *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = ++ *(reinterpret_cast(&(act_frag_s[(k - 1) % 2][0][0]))); ++ } ++ ++ for (int i = 1; i < 4; i++) { ++ *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = ++ *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))); ++ } ++ return; ++ } ++ ++ int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; ++ int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); ++ ++ constexpr int k_frag_offsets[4] = {0, 1, 8, ++ 9}; // Tensor core offsets per thread ++ 
++ #pragma unroll ++ for (int i = 0; i < 4; i++) { ++ int actual_k = cur_k + k_frag_offsets[i]; ++ ++ int group_id = sh_g_idx_int_ptr[actual_k]; ++ int rel_group_id = group_id - sh_first_group_id; ++ ++ *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = ++ sh_s[rel_group_id * s_sh_stride + s_col_shift]; ++ } ++ }; ++ ++ auto fetch_zp_to_registers = [&](int k, int full_pipe) { ++ // This code does not handle group_blocks == 0, ++ // which signifies act_order. ++ // has_zp implies AWQ, which doesn't have act_order, ++ static_assert(!has_zp || group_blocks != 0); ++ ++ if constexpr (has_zp) { ++ int pipe = full_pipe % stages; ++ ++ if constexpr (group_blocks == -1) { ++ for (int i = 0; i < num_ints_per_thread; i++) { ++ frag_qzp[k % 2][i] = (reinterpret_cast(sh_zp))[zp_sh_rd + i]; ++ } ++ ++ } else if constexpr (group_blocks >= thread_k_blocks) { ++ int4* sh_zp_stage = ++ sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * ++ (pipe / (group_blocks / thread_k_blocks))); ++ for (int i = 0; i < num_ints_per_thread; i++) { ++ frag_qzp[k % 2][i] = ++ (reinterpret_cast(sh_zp_stage))[zp_sh_rd + i]; ++ } ++ } else { ++ int warp_id = threadIdx.x / 32; ++ int n_warps = thread_n_blocks / 4; ++ ++ int warp_row = warp_id / n_warps; ++ ++ int cur_k = warp_row * 16; ++ cur_k += k_iter_size * (k % b_sh_wr_iters); ++ ++ int k_blocks = cur_k / 16; ++ int cur_group_id = 0; ++ ++ // Suppress bogus and persistent divide-by-zero warning ++ #pragma nv_diagnostic push ++ #pragma nv_diag_suppress divide_by_zero ++ cur_group_id = k_blocks / group_blocks; ++ #pragma nv_diagnostic pop ++ ++ int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; ++ ++ sh_zp_stage += cur_group_id * zp_sh_stride; ++ ++ for (int i = 0; i < num_ints_per_thread; i++) { ++ frag_qzp[k % 2][i] = ++ (reinterpret_cast(sh_zp_stage))[zp_sh_rd + i]; ++ } ++ } ++ } ++ }; ++ ++ // Execute the actual tensor core matmul of a sub-tile. ++ auto matmul = [&](int k) { ++ if constexpr (has_zp) { ++ FragB frag_zp_0; ++ FragB frag_zp_1; ++ int zp_quant_0, zp_quant_1; ++ ++ if constexpr (w_type.size_bits() == 4) { ++ zp_quant_0 = frag_qzp[k % 2][0]; ++ zp_quant_1 = zp_quant_0 >> 8; ++ } else { ++ static_assert(w_type.size_bits() == 8); ++ zp_quant_0 = frag_qzp[k % 2][0]; ++ zp_quant_1 = frag_qzp[k % 2][1]; ++ } ++ ++ frag_zp_0 = dequant(zp_quant_0); ++ frag_zp_1 = dequant(zp_quant_1); ++ ++ frag_zp[0] = frag_zp_0[0]; ++ frag_zp[1] = frag_zp_0[1]; ++ frag_zp[2] = frag_zp_1[0]; ++ frag_zp[3] = frag_zp_1[1]; ++ } ++ ++ // We have the m dimension as the inner loop in order to encourage overlapping ++ // dequantization and matmul operations. 
++ #pragma unroll ++ for (int j = 0; j < 4; j++) { ++ int b_quant_0, b_quant_1; ++ if constexpr (w_type.size_bits() == 4) { ++ b_quant_0 = frag_b_quant[k % 2][0][j]; ++ b_quant_1 = b_quant_0 >> 8; ++ } else { ++ static_assert(w_type.size_bits() == 8); ++ int* frag_b_quant_ptr = reinterpret_cast(frag_b_quant[k % 2]); ++ b_quant_0 = frag_b_quant_ptr[j * 2 + 0]; ++ b_quant_1 = frag_b_quant_ptr[j * 2 + 1]; ++ } ++ ++ FragB frag_b0 = dequant(b_quant_0); ++ FragB frag_b1 = dequant(b_quant_1); ++ // Apply zero-point to frag_b0 ++ if constexpr (has_zp) { ++ sub_zp(frag_b0, frag_zp[j], 0); ++ } ++ ++ // Apply scale to frag_b0 ++ if constexpr (has_act_order) { ++ scale4(frag_b0, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j], ++ act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 0); ++ } else { ++ if constexpr (group_blocks != -1) { ++ scale(frag_b0, frag_s[k % 2][j], 0); ++ } ++ } ++ ++ // Apply zero-point to frag_b1 ++ if constexpr (has_zp) { ++ sub_zp(frag_b1, frag_zp[j], 1); ++ } ++ ++ // Apply scale to frag_b1 ++ if constexpr (has_act_order) { ++ scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j], ++ act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 1); ++ ++ } else { ++ if constexpr (group_blocks != -1) { ++ scale(frag_b1, frag_s[k % 2][j], 1); ++ } ++ } ++ ++ #pragma unroll ++ for (int i = 0; i < thread_m_blocks; i++) { ++ mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); ++ mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); ++ } ++ } ++ }; ++ ++ // Since we slice across the k dimension of a tile in order to increase the ++ // number of warps while keeping the n dimension of a tile reasonable, we have ++ // multiple warps that accumulate their partial sums of the same output ++ // location; which we have to reduce over in the end. We do in shared memory. ++ auto thread_block_reduce = [&]() { ++ constexpr int red_off = threads / b_sh_stride_threads / 2; ++ if (red_off >= 1) { ++ int red_idx = threadIdx.x / b_sh_stride_threads; ++ constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2; ++ constexpr int red_sh_delta = b_sh_stride_threads; ++ int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) + ++ (threadIdx.x % b_sh_stride_threads); ++ ++ // Parallel logarithmic shared memory reduction. We make sure to avoid any ++ // unnecessary read or write iterations, e.g., for two warps we write only ++ // once by warp 1 and read only once by warp 0. 
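++      // Worked example (illustrative): in the 256-thread, 4-bit,
++      // thread_n_blocks = 8 configuration, b_sh_stride_threads = 64, so
++      // red_off = 256 / 64 / 2 = 2 and four groups of threads hold partial
++      // sums of the same output fragment; the loop below merges them in two
++      // rounds (i = 2, then i = 1) before the red_idx == 0 group adds the
++      // final values from shared memory.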
++ ++ #pragma unroll ++ for (int m_block = 0; m_block < thread_m_blocks; m_block++) { ++ #pragma unroll ++ for (int i = red_off; i > 0; i /= 2) { ++ if (i <= red_idx && red_idx < 2 * i) { ++ #pragma unroll ++ for (int j = 0; j < 4 * 2; j++) { ++ int red_sh_wr = ++ red_sh_delta * j + (red_sh_rd - red_sh_stride * i); ++ if (i < red_off) { ++ float* c_rd = ++ reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); ++ float* c_wr = reinterpret_cast(&sh[red_sh_wr]); ++ #pragma unroll ++ for (int k = 0; k < 4; k++) ++ reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += ++ c_rd[k] + c_wr[k]; ++ } ++ sh[red_sh_wr] = ++ reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; ++ } ++ } ++ __syncthreads(); ++ } ++ if (red_idx == 0) { ++ #pragma unroll ++ for (int i = 0; i < 4 * 2; i++) { ++ float* c_rd = ++ reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); ++ #pragma unroll ++ for (int j = 0; j < 4; j++) ++ reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += ++ c_rd[j]; ++ } ++ } ++ __syncthreads(); ++ } ++ } ++ }; ++ ++ // Since multiple threadblocks may process parts of the same column slice, we ++ // finally have to globally reduce over the results. As the striped ++ // partitioning minimizes the number of such reductions and our outputs are ++ // usually rather small, we perform this reduction serially in L2 cache. ++ auto global_reduce = [&](bool first = false, bool last = false) { ++ // We are very careful here to reduce directly in the output buffer to ++ // maximize L2 cache utilization in this step. To do this, we write out ++ // results in FP16 (but still reduce with FP32 compute). ++ constexpr int active_threads = 32 * thread_n_blocks / 4; ++ if (threadIdx.x < active_threads) { ++ int c_gl_stride = prob_n / 8; ++ int c_gl_wr_delta_o = 8 * c_gl_stride; ++ int c_gl_wr_delta_i = 4 * (active_threads / 32); ++ int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + ++ 4 * (threadIdx.x / 32) + threadIdx.x % 4; ++ c_gl_wr += (2 * thread_n_blocks) * slice_col; ++ constexpr int c_sh_wr_delta = active_threads; ++ int c_sh_wr = threadIdx.x; ++ ++ int row = (threadIdx.x % 32) / 4; ++ ++ if (!first) { ++ // Interestingly, doing direct global accesses here really seems to mess up ++ // the compiler and lead to slowdowns, hence we also use async-copies even ++ // though these fetches are not actually asynchronous. 
++ #pragma unroll ++ for (int i = 0; i < thread_m_blocks * 4; i++) { ++ int c_idx = ++ c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); ++ int sorted_row = sorted_ids[c_idx / c_gl_stride]; ++ int new_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride; ++ cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i], &C[new_idx], ++ sorted_row < tot_m * topk && ++ (8 * (i / 2) + row < prob_m && ++ (i < (thread_m_blocks - 1) * 4 || ++ sorted_ids[8 * (i / 2) + row] < tot_m * topk))); ++ } ++ cp_async_fence(); ++ cp_async_wait<0>(); ++ } ++ ++ #pragma unroll ++ for (int i = 0; i < thread_m_blocks * 4; i++) { ++ if (8 * (i / 2) + row < prob_m && ++ (i < (thread_m_blocks - 1) * 4 || ++ sorted_ids[8 * (i / 2) + row] < tot_m * topk)) { ++ if (!first) { ++ int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; ++ #pragma unroll ++ for (int j = 0; j < 2 * 4; j++) { ++ reinterpret_cast( ++ &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += ++ __half2float(reinterpret_cast<__half*>(&c_red)[j]); ++ } ++ } ++ if (!last) { ++ int4 c; ++ #pragma unroll ++ for (int j = 0; j < 2 * 4; j++) { ++ reinterpret_cast<__half*>(&c)[j] = ++ __float2half(reinterpret_cast( ++ &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); ++ } ++ int c_idx = ++ c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); ++ int row = sorted_ids[c_idx / c_gl_stride]; ++ if (row < tot_m * topk) { ++ int new_idx = row * c_gl_stride + c_idx % c_gl_stride; ++ C[new_idx] = c; ++ } ++ } ++ } ++ } ++ } ++ }; ++ ++ // Write out the reduce final result in the correct layout. We only actually ++ // reshuffle matrix fragments in this step, the reduction above is performed ++ // in fragment layout. ++ auto write_result = [&]() { ++ int c_gl_stride = prob_n / 8; ++ constexpr int c_sh_stride = 2 * thread_n_blocks + 1; ++ int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); ++ constexpr int c_sh_rd_delta = ++ c_sh_stride * (threads / (2 * thread_n_blocks)); ++ ++ int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + ++ (threadIdx.x % (2 * thread_n_blocks)); ++ c_gl_wr += (2 * thread_n_blocks) * slice_col; ++ int c_sh_wr = ++ (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; ++ c_sh_wr += 32 * (threadIdx.x / 32); ++ int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + ++ (threadIdx.x % (2 * thread_n_blocks)); ++ ++ int c_gl_wr_end = c_gl_stride * prob_m; ++ ++ // We first reorder in shared memory to guarantee the most efficient final ++ // global write patterns ++ auto write = [&](int idx, float c0, float c1, FragS& s) { ++ half2 res = __halves2half2(__float2half(c0), __float2half(c1)); ++ ++ // For per-column quantization we finally apply the scale here (only for ++ // 4-bit) ++ if constexpr (!has_act_order && group_blocks == -1 && ++ w_type.size_bits() == 4) { ++ res = __hmul2(res, s[0]); ++ } ++ ++ ((half2*)sh)[idx] = res; ++ }; ++ if (threadIdx.x / 32 < thread_n_blocks / 4) { ++ #pragma unroll ++ for (int i = 0; i < thread_m_blocks; i++) { ++ #pragma unroll ++ for (int j = 0; j < 4; j++) { ++ int wr = c_sh_wr + 8 * j; ++ write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], ++ frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); ++ write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], ++ frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); ++ write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], ++ frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); ++ write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], ++ frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); ++ } 
++ c_sh_wr += 16 * (4 * c_sh_stride); ++ } ++ } ++ __syncthreads(); ++ ++ #pragma unroll ++ for (int i = 0; ++ i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); ++ i++) { ++ if (c_gl_wr < c_gl_wr_end) { ++ int row = sorted_ids[c_gl_wr / c_gl_stride]; ++ if (row < tot_m * topk) { ++ int off = row * c_gl_stride + c_gl_wr % c_gl_stride; ++ if (!apply_weights) { ++ C[off] = sh[c_sh_rd]; ++ } else { ++ __half* ctrg = reinterpret_cast<__half*>(&C[off]); ++ __half* csrc = reinterpret_cast<__half*>(&sh[c_sh_rd]); ++ for (int j = 0; j < 8; ++j) { ++ ctrg[j] = __float2half(topk_weights[row] * __half2float(csrc[j])); ++ } ++ } ++ c_gl_wr += c_gl_wr_delta; ++ c_sh_rd += c_sh_rd_delta; ++ } ++ } ++ } ++ }; ++ ++ // Start global fetch and register load pipelines. ++ auto start_pipes = [&]() { ++ ++ #pragma unroll ++ for (int i = 0; i < stages - 1; i++) { ++ if (has_act_order && i == 0) { ++ int last_g_idx = slice_k_start + stages * tb_k * 2; ++ if (last_g_idx >= prob_k) { ++ last_g_idx = prob_k - 1; ++ } ++ fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]); ++ } ++ ++ if constexpr (has_zp && group_blocks == -1) { ++ if (i == 0) { ++ fetch_zp_to_shared(); ++ } ++ } ++ fetch_to_shared(i, i, i < slice_iters); ++ } ++ ++ zero_accums(); ++ wait_for_stage(); ++ init_same_group(0); ++ fetch_to_registers(0, 0); ++ fetch_scales_to_registers(0, 0); ++ fetch_zp_to_registers(0, 0); ++ a_gl_rd += a_gl_rd_delta_o * (stages - 1); ++ slice_k_start_shared_fetch += tb_k * (stages - 1); ++ }; ++ if (slice_iters) { ++ start_pipes(); ++ } ++ ++ // Main loop. ++ while (slice_iters) { ++ // We unroll over both the global fetch and the register load pipeline to ++ // ensure all shared memory accesses are static. Note that both pipelines ++ // have even length meaning that the next iteration will always start at ++ // index 0. ++ #pragma unroll ++ for (int pipe = 0; pipe < stages;) { ++ #pragma unroll ++ for (int k = 0; k < b_sh_wr_iters; k++) { ++ fetch_to_registers(k + 1, pipe % stages); ++ fetch_scales_to_registers(k + 1, pipe); ++ fetch_zp_to_registers(k + 1, pipe); ++ if (k == b_sh_wr_iters - 2) { ++ fetch_to_shared((pipe + stages - 1) % stages, pipe, ++ slice_iters >= stages); ++ pipe++; ++ wait_for_stage(); ++ init_same_group(pipe % stages); ++ } ++ matmul(k); ++ } ++ slice_iters--; ++ if (slice_iters == 0) { ++ break; ++ } ++ } ++ ++ a_gl_rd += a_gl_rd_delta_o * stages; ++ slice_k_start += tb_k * stages; ++ slice_k_start_shared_fetch += tb_k * stages; ++ ++ if constexpr (has_act_order) { ++ int first_group_id = g_idx[slice_k_start]; ++ int last_g_idx = slice_k_start + stages * tb_k * 2; ++ if (last_g_idx >= prob_k) { ++ last_g_idx = prob_k - 1; ++ } ++ int last_group_id = g_idx[last_g_idx]; ++ if (last_group_id >= sh_first_group_id + sh_num_groups) { ++ fetch_scales_to_shared(false, first_group_id, last_group_id); ++ __syncthreads(); ++ } ++ } ++ ++ // Process results and, if necessary, proceed to the next column slice. ++ // While this pattern may not be the most readable, other ways of writing ++ // the loop seemed to noticeably worse performance after compilation. 
++ if (slice_iters == 0) { ++ cp_async_wait<0>(); ++ bool last = slice_idx == slice_count - 1; ++ if constexpr (!has_act_order && group_blocks == -1) { ++ if constexpr (w_type.size_bits() == 8) { ++ if (s_sh_wr_pred) { ++ cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); ++ } ++ cp_async_fence(); ++ } else { ++ // For 4-bit per-column scales, we only fetch them here in the ++ // final step before write-out ++ if (last) { ++ if (s_sh_wr_pred) { ++ cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); ++ } ++ cp_async_fence(); ++ } ++ } ++ } ++ ++ thread_block_reduce(); ++ if constexpr (!has_act_order && group_blocks == -1) { ++ if constexpr (w_type.size_bits() == 8) { ++ cp_async_wait<0>(); ++ __syncthreads(); ++ if (threadIdx.x / 32 < thread_n_blocks / 4) { ++ reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; ++ reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; ++ } ++ ++ } else { ++ if (last) { ++ cp_async_wait<0>(); ++ __syncthreads(); ++ if (threadIdx.x / 32 < thread_n_blocks / 4) { ++ reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; ++ reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; ++ } ++ } ++ } ++ } ++ ++ // For 8-bit channelwise, we apply the scale before the global reduction ++ // that converts the fp32 results to fp16 (so that we avoid possible ++ // overflow in fp16) ++ if constexpr (!has_act_order && group_blocks == -1 && ++ w_type.size_bits() == 8) { ++ if (threadIdx.x / 32 < thread_n_blocks / 4) { ++ #pragma unroll ++ for (int i = 0; i < thread_m_blocks; i++) { ++ #pragma unroll ++ for (int j = 0; j < 4; j++) { ++ scale_float(reinterpret_cast(&frag_c[i][j][0][0]), ++ frag_s[j / 2][2 * (j % 2) + 0]); ++ scale_float(reinterpret_cast(&frag_c[i][j][0][2]), ++ frag_s[j / 2][2 * (j % 2) + 0]); ++ ++ scale_float(reinterpret_cast(&frag_c[i][j][1][0]), ++ frag_s[j / 2][2 * (j % 2) + 1]); ++ scale_float(reinterpret_cast(&frag_c[i][j][1][2]), ++ frag_s[j / 2][2 * (j % 2) + 1]); ++ } ++ } ++ } ++ } ++ ++ if (slice_count > 1) { // only globally reduce if there is more than one ++ // block in a slice ++ barrier_acquire(&locks[slice_col], slice_idx); ++ global_reduce(slice_idx == 0, last); ++ barrier_release(&locks[slice_col], last); ++ } ++ if (last) // only the last block in a slice actually writes the result ++ write_result(); ++ slice_row = 0; ++ slice_col_par++; ++ slice_col++; ++ init_slice(); ++ if (slice_iters) { ++ a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + ++ (threadIdx.x % a_gl_rd_delta_o); ++ #pragma unroll ++ for (int i = 0; i < b_sh_wr_iters; i++) ++ B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; ++ if (slice_col == 0) { ++ #pragma unroll ++ for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; ++ } ++ ++ // Update slice k/n for scales loading ++ if constexpr (has_act_order) { ++ slice_k_start = tb_k * slice_row; ++ slice_k_finish = slice_k_start + tb_k * slice_iters; ++ slice_k_start_shared_fetch = slice_k_start; ++ slice_n_offset = act_s_col_tb_stride * slice_col; ++ ++ } else { ++ s_gl_rd = s_sh_stride * slice_col + threadIdx.x; ++ zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x; ++ } ++ ++ start_pipes(); ++ } ++ } ++ } ++} ++ ++template shared ++ // fetch pipeline ++ const bool has_act_order, // whether act_order is enabled ++ const bool has_zp, // whether zero-points are enabled ++ const int group_blocks = -1 // number of consecutive 16x16 blocks ++ // with a separate quantization scale ++ > ++__global__ void MarlinMoE( ++ const int4* __restrict__ A, // fp16 input matrix of shape mxk ++ const int4* __restrict__ B, // 4bit quantized weight 
matrix of shape kxn ++ int4* __restrict__ C, // fp16 output buffer of shape mxn ++ const int* __restrict__ sorted_ids_base, // int32 sorted ids of experts ++ const float* __restrict__ topk_weights, // float topk weights ++ const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape ++ // (k/groupsize)xn ++ const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape ++ // (k/groupsize)x(n/pack_factor) ++ const int* __restrict__ g_idx, // int32 group indices of shape k ++ const int* __restrict__ expert_offsets, ++ int num_groups, // number of scale groups per output channel ++ int expert_idx, // idx of current expert ++ int num_experts, // number of experts ++ int topk, // topk parameter of moe ++ int prob_m, // batch dimension m ++ int prob_n, // output dimension n ++ int prob_k, // reduction dimension k ++ int tot_m, // total number of rows in A and C ++ int* locks, // extra global storage for barrier synchronization ++ bool replicate_input, // do we use the same input for each expert? ++ bool apply_weights, // apply weights to output ++ int current_m_block, // current m block to start kernel computation from ++ int max_par, // maximum parallelism ++ int cfg_max_m_blocks // upper bound on m blocks ++) { ++ int m_block_ctr = current_m_block; ++ ++ const int* sorted_ids_expert = ++ sorted_ids_base + expert_offsets[expert_idx] + m_block_ctr * 4 * max_par; ++ int tot_its = expert_offsets[expert_idx + 1] - expert_offsets[expert_idx]; ++ if (tot_its == 0) { ++ return; ++ } ++ int tot_m_blocks = ceildiv(tot_its, 16); ++ int pad = 16 * tot_m_blocks - tot_its; ++ ++ if (m_block_ctr >= tot_m_blocks) { ++ return; ++ } ++ ++ int max_block = tot_m_blocks - m_block_ctr; ++ prob_m = tot_its - 16 * m_block_ctr; ++ ++ int par = 1; ++ if (max_block > cfg_max_m_blocks) { ++ // Note that parallel > 1 currently only works for inputs without any ++ // padding ++ par = (16 * max_block - pad) / (16 * cfg_max_m_blocks); ++ if (par > max_par) par = max_par; ++ prob_m = (16 * cfg_max_m_blocks) * par; ++ m_block_ctr += cfg_max_m_blocks * (par - 1); ++ max_block = cfg_max_m_blocks; ++ } ++ ++ if (max_block == 1) { ++ MarlinMoESingle( ++ A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx, ++ expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, ++ prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, ++ current_m_block); ++ } else if (max_block == 2) { ++ MarlinMoESingle( ++ A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx, ++ expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, ++ prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, ++ current_m_block); ++ } else if (max_block == 3) { ++ MarlinMoESingle( ++ A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx, ++ expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, ++ prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, ++ current_m_block); ++ } else { ++ MarlinMoESingle( ++ A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx, ++ expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m, ++ prob_n, prob_k, tot_m, locks, replicate_input, apply_weights, ++ current_m_block); ++ } ++} ++ ++#else ++ ++template shared ++ // fetch pipeline ++ const bool has_act_order, // whether act_order is enabled ++ const bool has_zp, // whether zero-points are enabled ++ const int group_blocks = -1 // number of consecutive 16x16 blocks ++ // with a separate quantization scale ++ > ++__global__ void MarlinMoE( ++ const int4* 
__restrict__ A, // fp16 input matrix of shape mxk ++ const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn ++ int4* __restrict__ C, // fp16 output buffer of shape mxn ++ const int* __restrict__ sorted_ids, // int32 sorted ids of experts ++ const float* __restrict__ topk_weights, // float topk weights ++ const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape ++ // (k/groupsize)xn ++ const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape ++ // (k/groupsize)x(n/pack_factor) ++ const int* __restrict__ g_idx, // int32 group indices of shape k ++ const int* __restrict__ expert_offsets, ++ int num_groups, // number of scale groups per output channel ++ int expert_idx, // idx of current expert ++ int num_experts, // number of experts ++ int topk, // topk parameter of moe ++ int prob_m, // batch dimension m ++ int prob_n, // output dimension n ++ int prob_k, // reduction dimension k ++ int tot_m, // total number of rows in A and C ++ int* locks, // extra global storage for barrier synchronization ++ bool replicate_input, // do we use the same input for each expert? ++ bool apply_weights, // apply weights to output ++ int current_m_block, // current m block to start kernel computation from ++ int max_par, // maximum parallelism ++ int cfg_max_m_blocks // upper bound on m blocks ++) { ++ // Marlin is not implemented yet for SM < 8.0 ++ assert(false); ++ return; ++} ++ ++#endif ++ ++// 8 warps are a good choice since every SM has 4 schedulers and having more ++// than 1 warp per schedule allows some more latency hiding. At the same time, ++// we want relatively few warps to have many registers per warp and small tiles. ++const int USER_THREADS = ++ 256; // Note: This is only used with user-provided thread_k/n ++const int STAGES = 4; // 4 pipeline stages fit into shared memory ++ ++static constexpr int min_thread_n = 64; ++static constexpr int min_thread_k = 64; ++ ++#define __CALL_IF_MOE(W_TYPE, THREAD_N_BLOCKS, THREAD_K_BLOCKS, HAS_ACT_ORDER, \ ++ HAS_ZP, GROUP_BLOCKS, NUM_THREADS) \ ++ else if (q_type == W_TYPE && thread_n_blocks == THREAD_N_BLOCKS && \ ++ thread_k_blocks == THREAD_K_BLOCKS && \ ++ has_act_order == HAS_ACT_ORDER && has_zp == HAS_ZP && \ ++ group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ ++ cudaFuncSetAttribute( \ ++ MarlinMoE, \ ++ cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ ++ MarlinMoE \ ++ <<>>( \ ++ A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr, \ ++ zp_ptr, g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx, \ ++ num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, \ ++ replicate_input, apply_weights, m_block, max_par, \ ++ cfg_max_m_blocks); \ ++ } ++ ++#define GPTQ_CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ ++ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \ ++ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \ ++ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \ ++ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \ ++ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS) ++ ++#define AWQ_CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ ++ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \ ++ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \ ++ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \ ++ __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) ++ 
++} // namespace marlin_moe +diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu +new file mode 100644 +index 0000000..77bc0dd +--- /dev/null ++++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu +@@ -0,0 +1,31 @@ ++#include "marlin_moe_kernel_ku4.h" ++ ++namespace marlin_moe { ++ ++// We return bool so we can create these different kernel calls as a sequence ++// of if-elseif's. ++bool call_marlin_moe_kernel_ku4( ++ vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, ++ bool has_act_order, int group_blocks, int num_threads, int blocks, ++ int max_shared_mem, cudaStream_t stream, const int4* A_ptr, ++ const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, ++ const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, ++ const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, ++ int expert_idx, int num_experts, int topk, int prob_m, int prob_n, ++ int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, ++ int m_block, int max_par, int cfg_max_m_blocks) { ++ bool has_zp = true; ++ ++ if (false) { ++ } ++ AWQ_CALL_IF_MOE(vllm::kU4, 16, 4, 256) ++ AWQ_CALL_IF_MOE(vllm::kU4, 8, 8, 256) ++ AWQ_CALL_IF_MOE(vllm::kU4, 8, 4, 128) ++ AWQ_CALL_IF_MOE(vllm::kU4, 4, 8, 128) ++ else { ++ return false; ++ } ++ return true; ++} ++ ++} // namespace marlin_moe +diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h +new file mode 100644 +index 0000000..833fadf +--- /dev/null ++++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h +@@ -0,0 +1,20 @@ ++#pragma once ++ ++#include "marlin_moe_kernel.h" ++ ++namespace marlin_moe { ++ ++// We return bool so we can create these different kernel calls as a sequence ++// of if-elseif's. ++bool call_marlin_moe_kernel_ku4( ++ vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, ++ bool has_act_order, int group_blocks, int num_threads, int blocks, ++ int max_shared_mem, cudaStream_t stream, const int4* A_ptr, ++ const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, ++ const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, ++ const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, ++ int expert_idx, int num_experts, int topk, int prob_m, int prob_n, ++ int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, ++ int m_block, int max_par, int cfg_max_m_blocks); ++ ++} // namespace marlin_moe +diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu +new file mode 100644 +index 0000000..f7e57b0 +--- /dev/null ++++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu +@@ -0,0 +1,31 @@ ++#include "marlin_moe_kernel_ku4b8.h" ++ ++namespace marlin_moe { ++ ++// We return bool so we can create these different kernel calls as a sequence ++// of if-elseif's. 
++bool call_marlin_moe_kernel_ku4b8( ++ vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, ++ bool has_act_order, int group_blocks, int num_threads, int blocks, ++ int max_shared_mem, cudaStream_t stream, const int4* A_ptr, ++ const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, ++ const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, ++ const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, ++ int expert_idx, int num_experts, int topk, int prob_m, int prob_n, ++ int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, ++ int m_block, int max_par, int cfg_max_m_blocks) { ++ bool has_zp = false; ++ ++ if (false) { ++ } ++ GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256) ++ GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256) ++ GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128) ++ GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128) ++ else { ++ return false; ++ } ++ return true; ++} ++ ++} // namespace marlin_moe +diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h +new file mode 100644 +index 0000000..494da8f +--- /dev/null ++++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h +@@ -0,0 +1,20 @@ ++#pragma once ++ ++#include "marlin_moe_kernel.h" ++ ++namespace marlin_moe { ++ ++// We return bool so we can create these different kernel calls as a sequence ++// of if-elseif's. ++bool call_marlin_moe_kernel_ku4b8( ++ vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, ++ bool has_act_order, int group_blocks, int num_threads, int blocks, ++ int max_shared_mem, cudaStream_t stream, const int4* A_ptr, ++ const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, ++ const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, ++ const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, ++ int expert_idx, int num_experts, int topk, int prob_m, int prob_n, ++ int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, ++ int m_block, int max_par, int cfg_max_m_blocks); ++ ++} // namespace marlin_moe +diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu +new file mode 100644 +index 0000000..a901f0b +--- /dev/null ++++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu +@@ -0,0 +1,31 @@ ++#include "marlin_moe_kernel_ku8b128.h" ++ ++namespace marlin_moe { ++ ++// We return bool so we can create these different kernel calls as a sequence ++// of if-elseif's. 
++bool call_marlin_moe_kernel_ku8b128( ++ vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, ++ bool has_act_order, int group_blocks, int num_threads, int blocks, ++ int max_shared_mem, cudaStream_t stream, const int4* A_ptr, ++ const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, ++ const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, ++ const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, ++ int expert_idx, int num_experts, int topk, int prob_m, int prob_n, ++ int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, ++ int m_block, int max_par, int cfg_max_m_blocks) { ++ bool has_zp = false; ++ ++ if (false) { ++ } ++ GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256) ++ GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256) ++ GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128) ++ GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128) ++ else { ++ return false; ++ } ++ return true; ++} ++ ++} // namespace marlin_moe +diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h +new file mode 100644 +index 0000000..f3018aa +--- /dev/null ++++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h +@@ -0,0 +1,18 @@ ++#pragma once ++ ++#include "marlin_moe_kernel.h" ++ ++namespace marlin_moe { ++ ++bool call_marlin_moe_kernel_ku8b128( ++ vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks, ++ bool has_act_order, int group_blocks, int num_threads, int blocks, ++ int max_shared_mem, cudaStream_t stream, const int4* A_ptr, ++ const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr, ++ const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr, ++ const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups, ++ int expert_idx, int num_experts, int topk, int prob_m, int prob_n, ++ int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights, ++ int m_block, int max_par, int cfg_max_m_blocks); ++ ++} +diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu +new file mode 100644 +index 0000000..5f12483 +--- /dev/null ++++ b/csrc/moe/marlin_moe_ops.cu +@@ -0,0 +1,588 @@ ++/* ++ * Modified by Neural Magic ++ * Copyright (C) Marlin.2024 Elias Frantar ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "core/exception.hpp" ++#include "core/scalar_type.hpp" ++#include "core/registration.h" ++#include "marlin_kernels/marlin_moe_kernel_ku4b8.h" ++#include "marlin_kernels/marlin_moe_kernel_ku8b128.h" ++#include "marlin_kernels/marlin_moe_kernel_ku4.h" ++ ++template ++inline std::string str(T x) { ++ return std::to_string(x); ++} ++ ++namespace marlin_moe { ++ ++#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 ++ ++// For a given "a" of size [M,K] performs a permutation of the K columns based ++// on the given "perm" indices. 
++__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, ++ int const* __restrict__ perm_int_ptr, ++ int4* __restrict__ out_int4_ptr, int size_m, ++ int size_k, int block_rows) { ++ int start_row = block_rows * blockIdx.x; ++ int finish_row = start_row + block_rows; ++ if (finish_row > size_m) { ++ finish_row = size_m; ++ } ++ int cur_block_rows = finish_row - start_row; ++ ++ int row_stride = size_k * sizeof(half) / 16; ++ ++ auto permute_row = [&](int row) { ++ int iters = size_k / blockDim.x; ++ int rest = size_k % blockDim.x; ++ ++ int offset = row * row_stride; ++ ++ half const* a_row_half = reinterpret_cast(a_int4_ptr + offset); ++ half* out_half = reinterpret_cast(out_int4_ptr + offset); ++ ++ int base_k = 0; ++ ++ for (int i = 0; i < iters; i++) { ++ int cur_k = base_k + threadIdx.x; ++ int src_pos = perm_int_ptr[cur_k]; ++ ++ out_half[cur_k] = a_row_half[src_pos]; ++ ++ base_k += blockDim.x; ++ } ++ ++ if (rest) { ++ if (threadIdx.x < rest) { ++ int cur_k = base_k + threadIdx.x; ++ int src_pos = perm_int_ptr[cur_k]; ++ ++ out_half[cur_k] = a_row_half[src_pos]; ++ } ++ } ++ }; ++ ++ for (int i = 0; i < cur_block_rows; i++) { ++ int cur_row = start_row + i; ++ if (cur_row < size_m) { ++ permute_row(cur_row); ++ } ++ } ++} ++ ++__global__ void compute_expert_offsets(int const* __restrict__ topk_ids, ++ int* __restrict__ expert_offsets, ++ int topk_length, int block_size) { ++ int expert_id = threadIdx.x; ++ int num_experts = blockDim.x; ++ ++ int occurrences = 0; ++ for (int i = 0; i < topk_length; ++i) { ++ occurrences += (topk_ids[i] == expert_id); ++ } ++ expert_offsets[expert_id + 1] = occurrences; ++ __syncthreads(); ++ ++ if (threadIdx.x == 0) { ++ int tot_offset = 0; ++ expert_offsets[0] = 0; ++ for (int i = 0; i < num_experts; ++i) { ++ tot_offset += ceildiv(expert_offsets[i + 1], block_size) * block_size; ++ expert_offsets[i + 1] = tot_offset; ++ } ++ } ++ __syncthreads(); ++} ++ ++#else ++ ++__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, ++ int const* __restrict__ perm_int_ptr, ++ int4* __restrict__ out_int4_ptr, int size_m, ++ int size_k, int block_rows) { ++ // Marlin is not implemented yet for SM < 8.0 ++ assert(false); ++ return; ++} ++ ++__global__ void compute_expert_offsets(int const* __restrict__ topk_ids, ++ int* __restrict__ expert_offsets, ++ int topk_length, int block_size) { ++ // Marlin is not implemented yet for SM < 8.0 ++ assert(false); ++ return; ++} ++ ++#endif ++ ++typedef struct { ++ int thread_k; ++ int thread_n; ++ int num_threads; ++} thread_config_t; ++ ++typedef struct { ++ int max_m_blocks; ++ thread_config_t tb_cfg; ++} exec_config_t; ++ ++thread_config_t small_batch_thread_configs[] = { ++ // Ordered by priority ++ ++ // thread_k, thread_n, num_threads ++ {128, 128, 256}, // Default ++ {128, 64, 128}, // Reduce N 2X, same K ++ {64, 256, 256}, // Reduce K 2X, increase N 2X ++ {64, 128, 128}, // Reduce K 2X, same N ++ {64, 64, 128}, // Reduce both 2X ++}; ++ ++thread_config_t large_batch_thread_configs[] = { ++ // Ordered by priority ++ ++ // thread_k, thread_n, num_threads ++ {64, 256, 256}, // Default ++ {128, 128, 256}, // Reduce N 2X, increase K 2X ++ {64, 128, 128}, // Reduce N 2X, same K ++ {128, 64, 128}, // Reduce N 4X, increase K 2X ++ {64, 64, 128}, // Reduce N 4X, same K ++}; ++ ++int get_scales_cache_size(thread_config_t const& th_config, int prob_m, ++ int prob_n, int prob_k, int num_bits, int group_size, ++ bool has_act_order, bool is_k_full) { ++ bool cache_scales_chunk = has_act_order && 
!is_k_full; ++ ++ int tb_n = th_config.thread_n; ++ int tb_k = th_config.thread_k; ++ ++ // Get max scale groups per thread-block ++ int tb_groups; ++ if (group_size == -1) { ++ tb_groups = 1; ++ } else if (group_size == 0) { ++ tb_groups = ceildiv(tb_k, 32); // Worst case is 32 group size ++ } else { ++ tb_groups = ceildiv(tb_k, group_size); ++ } ++ ++ if (cache_scales_chunk) { ++ int load_groups = ++ tb_groups * STAGES * 2; // Chunk size is 2x pipeline over dim K ++ load_groups = max(load_groups, 32); // We load at least 32 scale groups ++ return load_groups * tb_n * 4; ++ ++ } else { ++ int tb_scales = tb_groups * tb_n * 2; ++ ++ return tb_scales * STAGES; ++ } ++} ++ ++bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks, ++ int prob_m, int prob_n, int prob_k, int num_bits, ++ int scales_cache_size, int max_shared_mem) { ++ int pack_factor = 32 / num_bits; ++ ++ // Get B size ++ int tb_k = th_config.thread_k; ++ int tb_n = th_config.thread_n; ++ ++ int b_size = (tb_k * tb_n / pack_factor) * 4; ++ ++ // Get A size ++ int m_blocks = ceildiv(prob_m, 16); ++ int tb_max_m = 16; ++ ++ while (true) { ++ if (m_blocks >= max_m_blocks) { ++ tb_max_m *= max_m_blocks; ++ break; ++ } ++ ++ max_m_blocks--; ++ if (max_m_blocks == 0) { ++ TORCH_CHECK(false, "Unexpected m_blocks = ", m_blocks); ++ } ++ } ++ ++ int a_size = (tb_max_m * tb_k) * 2; ++ ++ float pipe_size = (a_size + b_size) * STAGES; ++ ++ TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity ++ ++ return pipe_size < 0.95f * (max_shared_mem - scales_cache_size); ++} ++ ++bool is_valid_config(thread_config_t const& th_config, int max_m_blocks, ++ int prob_m, int prob_n, int prob_k, int num_bits, ++ int group_size, bool has_act_order, bool is_k_full, ++ int max_shared_mem) { ++ // Sanity ++ if (th_config.thread_k == -1 || th_config.thread_n == -1 || ++ th_config.num_threads == -1) { ++ return false; ++ } ++ ++ // Verify K/N are divisible by thread K/N ++ if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { ++ return false; ++ } ++ ++ // thread_k can be only 128 or 64 (because it must be less than groupsize ++ // which is 128) ++ if (th_config.thread_k != 128 && th_config.thread_k != 64) { ++ return false; ++ } ++ ++ // Verify min for thread K/N ++ if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { ++ return false; ++ } ++ ++ // num_threads must be at least 128 (= 4 warps) ++ if (th_config.num_threads < 128) { ++ return false; ++ } ++ ++ // Determine cache for scales ++ int scales_cache_size = ++ get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits, ++ group_size, has_act_order, is_k_full); ++ ++ // Check that pipeline fits into cache ++ if (!is_valid_cache_size(th_config, max_m_blocks, prob_m, prob_n, prob_k, ++ num_bits, scales_cache_size, max_shared_mem)) { ++ return false; ++ } ++ ++ return true; ++} ++ ++exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, ++ int num_bits, int group_size, ++ bool has_act_order, bool is_k_full, ++ int max_shared_mem) { ++ int max_m_blocks = 4; ++ while (max_m_blocks > 0) { ++ if (prob_m <= 16) { ++ for (auto th_config : small_batch_thread_configs) { ++ if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k, ++ num_bits, group_size, has_act_order, is_k_full, ++ max_shared_mem)) { ++ return exec_config_t{max_m_blocks, th_config}; ++ } ++ } ++ } else { ++ for (auto th_config : large_batch_thread_configs) { ++ if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k, ++ 
num_bits, group_size, has_act_order, is_k_full, ++ max_shared_mem)) { ++ return exec_config_t{max_m_blocks, th_config}; ++ } ++ } ++ } ++ ++ max_m_blocks--; // Process less M blocks per invocation to reduce cache ++ // usage ++ } ++ ++ return exec_config_t{0, {-1, -1, -1}}; ++} ++ ++#define CALL_MOE_KERNEL_FUNCTION(KERNEL_FUNCTION) \ ++ else if (KERNEL_FUNCTION( \ ++ q_type, thread_n_blocks, thread_k_blocks, has_act_order, \ ++ group_blocks, num_threads, blocks, max_shared_mem, stream, \ ++ A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr, \ ++ zp_ptr, g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx, \ ++ num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, \ ++ replicate_input, apply_weights, m_block, max_par, \ ++ exec_cfg.max_m_blocks)) { \ ++ } ++ ++void marlin_mm_moe(const void* A, const void* B, void* C, ++ const void* sorted_ids, const void* topk_weights, ++ const void* topk_ids, const void* s, void* zp, ++ const void* g_idx, const void* perm, void* a_tmp, ++ void* expert_offsets, int prob_m, int prob_n, int prob_k, ++ void* workspace, vllm::ScalarType const& q_type, ++ bool has_act_order, bool is_k_full, bool has_zp, ++ int num_groups, int group_size, int num_experts, int topk, ++ int moe_block_size, int dev, cudaStream_t stream, ++ int thread_k, int thread_n, int sms, int max_par, ++ bool replicate_input, bool apply_weights) { ++ TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, ++ ", ", prob_n, ", ", prob_k, "]"); ++ ++ if (sms == -1) { ++ cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); ++ } ++ ++ int max_shared_mem = 0; ++ cudaDeviceGetAttribute(&max_shared_mem, ++ cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); ++ TORCH_CHECK(max_shared_mem > 0); ++ ++ int num_bits = q_type.size_bits(); ++ ++ // Set thread config ++ exec_config_t exec_cfg; ++ if (thread_k != -1 && thread_n != -1) { ++ // User-defined config ++ exec_cfg = ++ exec_config_t{4, thread_config_t{thread_k, thread_n, USER_THREADS}}; ++ } else { ++ // Auto config ++ exec_cfg = ++ determine_thread_config(prob_m, prob_n, prob_k, num_bits, group_size, ++ has_act_order, is_k_full, max_shared_mem); ++ } ++ ++ TORCH_CHECK(exec_cfg.max_m_blocks > 0 && ++ is_valid_config(exec_cfg.tb_cfg, exec_cfg.max_m_blocks, ++ prob_m, prob_n, prob_k, num_bits, group_size, ++ has_act_order, is_k_full, max_shared_mem), ++ "Invalid thread config: max_m_blocks = ", exec_cfg.max_m_blocks, ++ ", thread_k = ", exec_cfg.tb_cfg.thread_k, ++ ", thread_n = ", exec_cfg.tb_cfg.thread_n, ++ ", num_threads = ", exec_cfg.tb_cfg.num_threads, " for MKN = [", ++ prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits, ++ ", group_size = ", group_size, ++ ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full, ++ ", max_shared_mem = ", max_shared_mem); ++ ++ int num_threads = exec_cfg.tb_cfg.num_threads; ++ thread_k = exec_cfg.tb_cfg.thread_k; ++ thread_n = exec_cfg.tb_cfg.thread_n; ++ ++ int thread_k_blocks = thread_k / 16; ++ int thread_n_blocks = thread_n / 16; ++ ++ int blocks = sms; ++ ++ TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, ++ " is not divisible by thread_n = ", thread_n); ++ TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, ++ " is not divisible by thread_k = ", thread_k); ++ ++ int group_blocks = 0; ++ if (has_act_order) { ++ if (is_k_full) { ++ TORCH_CHECK(group_size != -1); ++ group_blocks = group_size / 16; ++ TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, ++ " is not divisible by group_blocks = ", group_blocks); ++ } else 
{ ++ TORCH_CHECK(group_size == 0); ++ group_blocks = 0; ++ } ++ ++ } else { ++ if (group_size == -1) { ++ group_blocks = -1; ++ } else { ++ group_blocks = group_size / 16; ++ TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, ++ " is not divisible by group_blocks = ", group_blocks); ++ } ++ } ++ ++ int tot_m = prob_m; ++ ++ const int* topk_ids_ptr = (const int*)topk_ids; ++ int* expert_offsets_ptr = (int*)expert_offsets; ++ compute_expert_offsets<<<1, num_experts, 0, stream>>>( ++ topk_ids_ptr, expert_offsets_ptr, tot_m * topk, moe_block_size); ++ ++ bool do_permute_a = has_act_order; ++ ++ // If we have a full K, then we can run the non-act-order version of Marlin ++ // (since the weight rows are reordered by increasing group ids, and by ++ // having a full K, we have full original groups) ++ if (is_k_full) { ++ has_act_order = false; ++ } ++ ++ int pack_factor = 32 / q_type.size_bits(); ++ ++ for (int expert_idx = 0; expert_idx < num_experts; ++expert_idx) { ++ const int4* A_ptr = (const int4*)A; ++ int4* a_tmp_ptr = (int4*)a_tmp; ++ const int4* B_ptr = ++ (const int4*)B + (prob_n * prob_k / (pack_factor * 4)) * expert_idx; ++ int4* C_ptr = (int4*)C; ++ const float* topk_weights_ptr = (const float*)topk_weights; ++ const int* sorted_ids_ptr = (const int*)sorted_ids; ++ const int4* s_ptr = (const int4*)s + num_groups * prob_n / 8 * expert_idx; ++ const int4* zp_ptr = ++ (const int4*)zp + num_groups * prob_n / (pack_factor * 4) * expert_idx; ++ const int* g_idx_ptr = (const int*)g_idx + prob_k * expert_idx; ++ const int* perm_ptr = (const int*)perm + prob_k * expert_idx; ++ int* locks = (int*)workspace; ++ ++ if (do_permute_a) { ++ // Permute A columns ++ int topk_rows = replicate_input ? tot_m : tot_m * topk; ++ int block_rows = ceildiv(topk_rows, blocks); ++ permute_cols_kernel<<>>( ++ A_ptr, perm_ptr, a_tmp_ptr, topk_rows, prob_k, block_rows); ++ A_ptr = a_tmp_ptr; ++ } ++ ++ int tot_m_blocks = ceildiv(tot_m, 16); ++ for (int m_block = 0; m_block < tot_m_blocks; ++ m_block += 4 * exec_cfg.max_m_blocks) { ++ if (false) { ++ } ++ CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku4b8) ++ CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku8b128) ++ CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku4) ++ else { ++ TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + ++ str(prob_n) + ", " + str(prob_k) + "]" + ++ ", has_act_order = " + str(has_act_order) + ++ ", num_groups = " + str(num_groups) + ++ ", group_size = " + str(group_size) + ++ ", thread_n_blocks = " + str(thread_n_blocks) + ++ ", thread_k_blocks = " + str(thread_k_blocks)); ++ } ++ } ++ } ++} ++ ++} // namespace marlin_moe ++ ++torch::Tensor marlin_gemm_moe( ++ const torch::Tensor& a, const torch::Tensor& b_q_weights, ++ const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights, ++ const torch::Tensor& topk_ids, const torch::Tensor& b_scales, ++ torch::Tensor& b_zeros, const torch::Tensor& g_idx, ++ const torch::Tensor& perm, torch::Tensor& workspace, ++ vllm::ScalarTypeId const b_q_type_id, int64_t size_m, int64_t size_n, ++ int64_t size_k, bool is_k_full, int64_t num_experts, int64_t topk, ++ int64_t moe_block_size, bool replicate_input, bool apply_weights) { ++ vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id); ++ bool has_zp = b_zeros.size(1) != 0; ++ if (has_zp) { ++ TORCH_CHECK( ++ b_q_type == vllm::kU4, ++ "b_q_type must be u4 when has_zp = True. 
Got = ", b_q_type.str()); ++ } else { ++ TORCH_CHECK( ++ b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128, ++ "b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type.str()); ++ } ++ ++ int pack_factor = 32 / b_q_type.size_bits(); ++ ++ int max_par = 4; ++ ++ int dev = a.get_device(); ++ ++ auto options_dtype = ++ torch::TensorOptions().dtype(a.dtype()).device(a.device()); ++ auto options_int = ++ torch::TensorOptions().dtype(torch::kInt).device(a.device()); ++ torch::Tensor c = torch::zeros({size_m, topk, size_n}, options_dtype); ++ torch::Tensor a_tmp = ++ replicate_input ? torch::zeros({size_m, size_k}, options_dtype) ++ : torch::zeros({size_m, topk, size_k}, options_dtype); ++ torch::Tensor expert_offsets = torch::empty({num_experts + 1}, options_int); ++ ++ // thread_k: `k` size of a thread_tile in `weights` (can usually be left as ++ // auto -1) ++ int thread_k = -1; ++ // thread_n: `n` size of a thread_tile in `weights` (can usually be left as ++ // auto -1) ++ int thread_n = -1; ++ // sms: number of SMs to use for the kernel (can usually be left as auto -1) ++ int sms = -1; ++ ++ // Detect groupsize and act_order ++ int num_groups = -1; ++ int group_size = -1; ++ bool has_act_order = g_idx.size(1) != 0; ++ ++ int b_rank = b_scales.sizes().size(); ++ TORCH_CHECK(b_rank == 3, "b_scales rank = ", b_rank, " is not 3"); ++ TORCH_CHECK(b_scales.size(2) == size_n, "b_scales dim 2 = ", b_scales.size(2), ++ " is not size_n = ", size_n); ++ num_groups = b_scales.size(1); ++ ++ TORCH_CHECK(VLLM_IMPLIES(!is_k_full, has_act_order), ++ "if is_k_full is false, has_act_order must be true"); ++ ++ if (has_act_order) { ++ if (is_k_full) { ++ TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1"); ++ TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k, ++ ", is not divisible by num_groups = ", num_groups); ++ group_size = size_k / num_groups; ++ } else { ++ group_size = 0; ++ } ++ ++ } else { ++ if (num_groups > 1) { ++ TORCH_CHECK( ++ size_k % num_groups == 0, "size_k = ", size_k, ++ ", is not divisible by b_scales.size(0) = ", b_scales.size(0)); ++ group_size = size_k / num_groups; ++ } else { ++ group_size = -1; ++ } ++ } ++ ++ // Verify b_zeros ++ if (has_zp) { ++ int rank = b_zeros.sizes().size(); ++ TORCH_CHECK(rank == 3, "b_zeros rank = ", rank, " is not 3"); ++ TORCH_CHECK(b_zeros.size(1) == num_groups, ++ "b_zeros dim 1 = ", b_zeros.size(1), ++ " is not num_groups = ", num_groups); ++ TORCH_CHECK(b_zeros.size(2) == size_n / pack_factor, ++ "b_zeros dim 2 = ", b_zeros.size(2), ++ " is not size_n / pack_factor = ", size_n / pack_factor); ++ } ++ ++ marlin_moe::marlin_mm_moe( ++ a.data_ptr(), b_q_weights.data_ptr(), c.data_ptr(), sorted_ids.data_ptr(), ++ topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(), ++ b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), ++ expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), ++ b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size, ++ num_experts, topk, moe_block_size, dev, ++ at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par, ++ replicate_input, apply_weights); ++ return c; ++} ++ ++TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { ++ m.impl("marlin_gemm_moe", &marlin_gemm_moe); ++} +diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu +new file mode 100644 +index 0000000..24341d6 +--- /dev/null ++++ b/csrc/moe/moe_align_sum_kernels.cu +@@ -0,0 +1,324 @@ ++#include ++#include ++#include ++ ++#include 
++#include ++ ++#include "../cuda_compat.h" ++#include "../dispatch_utils.h" ++ ++#define CEILDIV(x, y) (((x) + (y) - 1) / (y)) ++ ++namespace vllm { ++namespace moe { ++ ++namespace { ++__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, ++ int32_t col) { ++ // don't worry about overflow because num_experts is relatively small ++ return row * total_col + col; ++} ++} // namespace ++ ++template ++__global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, ++ int32_t* sorted_token_ids, ++ int32_t* expert_ids, ++ int32_t* total_tokens_post_pad, ++ int32_t num_experts, ++ int32_t block_size, size_t numel) { ++ const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); ++ const size_t start_idx = threadIdx.x * tokens_per_thread; ++ ++ extern __shared__ int32_t shared_mem[]; ++ ++ int32_t* tokens_cnts = ++ shared_mem; // 2d tensor with shape (blockDim.x + 1, num_experts) ++ int32_t* cumsum = ++ shared_mem + ++ (blockDim.x + 1) * num_experts; // 1d tensor with shape (num_experts + 1) ++ ++ for (int i = 0; i < num_experts; ++i) { ++ tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; ++ } ++ ++ /** ++ * In the first step we compute token_cnts[thread_index + 1][expert_index], ++ * which counts how many tokens in the token shard of thread_index are ++ * assigned to expert expert_index. ++ */ ++ for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { ++ ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])]; ++ } ++ ++ __syncthreads(); ++ ++ // For each expert we accumulate the token counts from the different threads. ++ if (threadIdx.x < num_experts) { ++ tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; ++ for (int i = 1; i <= blockDim.x; ++i) { ++ tokens_cnts[index(num_experts, i, threadIdx.x)] += ++ tokens_cnts[index(num_experts, i - 1, threadIdx.x)]; ++ } ++ } ++ ++ __syncthreads(); ++ ++ // We accumulate the token counts of all experts in thread 0. ++ if (threadIdx.x == 0) { ++ cumsum[0] = 0; ++ for (int i = 1; i <= num_experts; ++i) { ++ cumsum[i] = cumsum[i - 1] + ++ CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)], ++ block_size) * ++ block_size; ++ } ++ *total_tokens_post_pad = cumsum[num_experts]; ++ } ++ ++ __syncthreads(); ++ ++ /** ++ * For each expert, each thread processes the tokens of the corresponding ++ * blocks and stores the corresponding expert_id for each block. ++ */ ++ if (threadIdx.x < num_experts) { ++ for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; ++ i += block_size) { ++ expert_ids[i / block_size] = threadIdx.x; ++ } ++ } ++ ++ /** ++ * Each thread processes a token shard, calculating the index of each token ++ * after sorting by expert number. Given the example topk_ids = ++ * [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *, ++ * *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a ++ * padding value(preset in python). ++ */ ++ for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { ++ int32_t expert_id = topk_ids[i]; ++ /** The cumsum[expert_id] stores the starting index of the tokens that the ++ * expert with expert_id needs to process, and ++ * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens ++ * processed by the expert with expert_id within the current thread's token ++ * shard. 
++ */ ++ int32_t rank_post_pad = ++ tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + ++ cumsum[expert_id]; ++ sorted_token_ids[rank_post_pad] = i; ++ ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)]; ++ } ++} ++ ++// TODO(simon): this is temporarily adapted from ++// https://github.com/sgl-project/sglang/commit/31548116a8dc8c6df7e146e0587335a59fc5b9d7 ++// we did this to unblock Deepseek V3 but there should be a better ++// implementation to manage shared memory. ++template ++__global__ void moe_align_block_size_global_mem_kernel( ++ scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids, ++ int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts, ++ int32_t block_size, size_t numel, int32_t* tokens_cnts, int32_t* cumsum) { ++ const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); ++ const size_t start_idx = threadIdx.x * tokens_per_thread; ++ ++ for (int i = 0; i < num_experts; ++i) { ++ tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; ++ } ++ ++ /** ++ * In the first step we compute token_cnts[thread_index + 1][expert_index], ++ * which counts how many tokens in the token shard of thread_index are ++ * assigned to expert expert_index. ++ */ ++ for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { ++ ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])]; ++ } ++ ++ __syncthreads(); ++ ++ // For each expert we accumulate the token counts from the different threads. ++ if (threadIdx.x < num_experts) { ++ tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; ++ for (int i = 1; i <= blockDim.x; ++i) { ++ tokens_cnts[index(num_experts, i, threadIdx.x)] += ++ tokens_cnts[index(num_experts, i - 1, threadIdx.x)]; ++ } ++ } ++ ++ __syncthreads(); ++ ++ // We accumulate the token counts of all experts in thread 0. ++ if (threadIdx.x == 0) { ++ cumsum[0] = 0; ++ for (int i = 1; i <= num_experts; ++i) { ++ cumsum[i] = cumsum[i - 1] + ++ CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)], ++ block_size) * ++ block_size; ++ } ++ *total_tokens_post_pad = cumsum[num_experts]; ++ } ++ ++ __syncthreads(); ++ ++ /** ++ * For each expert, each thread processes the tokens of the corresponding ++ * blocks and stores the corresponding expert_id for each block. ++ */ ++ if (threadIdx.x < num_experts) { ++ for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; ++ i += block_size) { ++ expert_ids[i / block_size] = threadIdx.x; ++ } ++ } ++ ++ /** ++ * Each thread processes a token shard, calculating the index of each token ++ * after sorting by expert number. Given the example topk_ids = ++ * [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *, ++ * *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a ++ * padding value(preset in python). ++ */ ++ for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { ++ int32_t expert_id = topk_ids[i]; ++ /** The cumsum[expert_id] stores the starting index of the tokens that the ++ * expert with expert_id needs to process, and ++ * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens ++ * processed by the expert with expert_id within the current thread's token ++ * shard. 
++ */ ++ int32_t rank_post_pad = ++ tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + ++ cumsum[expert_id]; ++ sorted_token_ids[rank_post_pad] = i; ++ ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)]; ++ } ++} ++ ++template ++__global__ void moe_sum_kernel( ++ scalar_t* __restrict__ out, // [..., d] ++ const scalar_t* __restrict__ input, // [..., topk, d] ++ const int d) { ++ const int64_t token_idx = blockIdx.x; ++ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { ++ scalar_t x = 0.0; ++#pragma unroll ++ for (int k = 0; k < TOPK; ++k) { ++ x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]); ++ } ++ out[token_idx * d + idx] = x; ++ } ++} ++ ++} // namespace moe ++} // namespace vllm ++ ++void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, ++ int64_t block_size, torch::Tensor sorted_token_ids, ++ torch::Tensor experts_ids, ++ torch::Tensor num_tokens_post_pad) { ++ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); ++ ++ // If we have very large number of experts, we can no longer use shared ++ // memory. ++ // TODO(simon): the right solution should be calculating the exact right ++ // amount of shared memory and use that. The num_experts >= 256 is just a ++ // temporary solution to unblock Deepseek V3. ++ if (num_experts >= 256) { ++ VLLM_DISPATCH_INTEGRAL_TYPES( ++ topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] { ++ // calc needed amount of shared mem for `tokens_cnts` and `cumsum` ++ // tensors ++ const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); ++ ++ const int32_t mem_tokens_cnts = ++ ((num_experts + 1) * num_experts) * sizeof(int32_t); ++ const int32_t mem_cumsum = (num_experts + 1) * sizeof(int32_t); ++ // allocate global memory ++ int32_t* tokens_cnts; ++ int32_t* cumsum; ++ cudaMalloc(&tokens_cnts, mem_tokens_cnts); ++ cudaMalloc(&cumsum, mem_cumsum); ++ ++ auto kernel = ++ vllm::moe::moe_align_block_size_global_mem_kernel; ++ kernel<<<1, num_thread, 0, stream>>>( ++ topk_ids.data_ptr(), ++ sorted_token_ids.data_ptr(), ++ experts_ids.data_ptr(), ++ num_tokens_post_pad.data_ptr(), num_experts, block_size, ++ topk_ids.numel(), tokens_cnts, cumsum); ++ cudaFree(tokens_cnts); ++ cudaFree(cumsum); ++ }); ++ } else { ++ VLLM_DISPATCH_INTEGRAL_TYPES( ++ topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { ++ // calc needed amount of shared mem for `tokens_cnts` and `cumsum` ++ // tensors ++ const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); ++ const int32_t shared_mem = ++ ((num_thread + 1) * num_experts + (num_experts + 1)) * ++ sizeof(int32_t); ++ ++ // set dynamic shared mem ++ auto kernel = vllm::moe::moe_align_block_size_kernel; ++ AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( ++ (void*)kernel, shared_mem)); ++ kernel<<<1, num_thread, shared_mem, stream>>>( ++ topk_ids.data_ptr(), ++ sorted_token_ids.data_ptr(), ++ experts_ids.data_ptr(), ++ num_tokens_post_pad.data_ptr(), num_experts, block_size, ++ topk_ids.numel()); ++ }); ++ } ++} ++ ++void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] ++ torch::Tensor& output) // [num_tokens, hidden_size] ++{ ++ const int hidden_size = input.size(-1); ++ const int num_tokens = output.numel() / hidden_size; ++ const int topk = input.size(1); ++ ++ dim3 grid(num_tokens); ++ dim3 block(std::min(hidden_size, 1024)); ++ const at::cuda::OptionalCUDAGuard device_guard(device_of(output)); ++ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); ++ ++ switch (topk) { ++ case 2: ++ 
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] { ++ vllm::moe::moe_sum_kernel<<>>( ++ output.data_ptr(), input.data_ptr(), ++ hidden_size); ++ }); ++ break; ++ ++ case 3: ++ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] { ++ vllm::moe::moe_sum_kernel<<>>( ++ output.data_ptr(), input.data_ptr(), ++ hidden_size); ++ }); ++ break; ++ ++ case 4: ++ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] { ++ vllm::moe::moe_sum_kernel<<>>( ++ output.data_ptr(), input.data_ptr(), ++ hidden_size); ++ }); ++ break; ++ ++ default: ++ at::sum_out(output, input, 1); ++ break; ++ } ++} +diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h +index a01be3e..596cc0a 100644 +--- a/csrc/moe/moe_ops.h ++++ b/csrc/moe/moe_ops.h +@@ -1,9 +1,14 @@ + #pragma once + +-#include ++#include + +-void topk_softmax( +- torch::Tensor& topk_weights, +- torch::Tensor& topk_indices, +- torch::Tensor& token_expert_indices, +- torch::Tensor& gating_output); ++void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, ++ torch::Tensor& token_expert_indices, ++ torch::Tensor& gating_output); ++ ++void moe_sum(torch::Tensor& input, torch::Tensor& output); ++ ++void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, ++ int64_t block_size, torch::Tensor sorted_token_ids, ++ torch::Tensor experts_ids, ++ torch::Tensor num_tokens_post_pad); +diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu +index 8c65f40..de9747b 100644 +--- a/csrc/moe/topk_softmax_kernels.cu ++++ b/csrc/moe/topk_softmax_kernels.cu +@@ -16,18 +16,25 @@ + * See the License for the specific language governing permissions and + * limitations under the License. + */ +-#include ++#include + #include + #include ++#include "../cuda_compat.h" + +-#include +-#include ++#ifndef USE_ROCM ++ #include ++ #include ++#else ++ #include ++ #include ++#endif ++ ++#define MAX(a, b) ((a) > (b) ? (a) : (b)) ++#define MIN(a, b) ((a) < (b) ? (a) : (b)) + + namespace vllm { + namespace moe { + +-static constexpr int WARP_SIZE = 32; +- + /// Aligned array type + template < + typename T, +@@ -265,7 +272,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ + #pragma unroll + for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) + { +- thread_max = max(thread_max, __shfl_xor_sync(0xFFFFFFFF, thread_max, mask, THREADS_PER_ROW)); ++ thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW)); + } + + // From this point, thread max in all the threads have the max within the row. 
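// Illustrative sketch, not from the patched file: the shuffle loops in this
// file implement a butterfly (XOR) warp reduction - each step exchanges
// values with the lane whose id differs in one bit, so after log2(width)
// steps every lane of the sub-group holds the row maximum. The hunks here
// only swap the raw __shfl_xor_sync for VLLM_SHFL_XOR_SYNC_WIDTH so the same
// code also builds under ROCm. A minimal standalone version of the reduction
// pattern (hypothetical name row_max_kernel, plain CUDA for brevity):

#include <cstdio>
#include <cuda_runtime.h>

template <int WIDTH>  // threads cooperating on one row; must be a power of two
__global__ void row_max_kernel(const float* in, float* out) {
  float v = in[threadIdx.x];
#pragma unroll
  for (int mask = WIDTH / 2; mask > 0; mask /= 2) {
    // Exchange with the lane whose index differs in the bit selected by mask.
    v = fmaxf(v, __shfl_xor_sync(0xFFFFFFFF, v, mask, WIDTH));
  }
  out[threadIdx.x] = v;  // every lane now holds the max of its WIDTH-wide row
}

int main() {
  const int n = 32;
  float h_in[n], h_out[n];
  for (int i = 0; i < n; ++i) h_in[i] = static_cast<float>(i % 8);
  float *d_in, *d_out;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);
  row_max_kernel<8><<<1, n>>>(d_in, d_out);  // four rows of eight lanes each
  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("row max seen by lane 0: %.1f\n", h_out[0]);  // expected 7.0
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}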
+@@ -282,7 +289,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ + #pragma unroll + for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) + { +- row_sum += __shfl_xor_sync(0xFFFFFFFF, row_sum, mask, THREADS_PER_ROW); ++ row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW); + } + + // From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables +@@ -332,8 +339,8 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ + #pragma unroll + for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) + { +- float other_max = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, THREADS_PER_ROW); +- int other_expert = __shfl_xor_sync(0xFFFFFFFF, expert, mask, THREADS_PER_ROW); ++ float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW); ++ int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW); + + // We want lower indices to "win" in every thread so we break ties this way + if (other_max > max_val || (other_max == max_val && other_expert < expert)) +@@ -383,7 +390,7 @@ struct TopkConstants + { + static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float); + static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, ""); +- static constexpr int VECs_PER_THREAD = std::max(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE)); ++ static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE)); + static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG; + static constexpr int THREADS_PER_ROW = EXPERTS / VPT; + static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW; +@@ -396,7 +403,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f + { + static constexpr std::size_t MAX_BYTES_PER_LDG = 16; + +- static constexpr int BYTES_PER_LDG = std::min(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS); ++ static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS); + using Constants = detail::TopkConstants; + static constexpr int VPT = Constants::VPT; + static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP; +diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp +new file mode 100644 +index 0000000..f3a558c +--- /dev/null ++++ b/csrc/moe/torch_bindings.cpp +@@ -0,0 +1,39 @@ ++#include "core/registration.h" ++#include "moe_ops.h" ++ ++TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { ++ // Apply topk softmax to the gating outputs. ++ m.def( ++ "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! " ++ "token_expert_indices, Tensor gating_output) -> ()"); ++ m.impl("topk_softmax", torch::kCUDA, &topk_softmax); ++ ++ // Calculate the result of moe by summing up the partial results ++ // from all selected experts. ++ m.def("moe_sum(Tensor! input, Tensor output) -> ()"); ++ m.impl("moe_sum", torch::kCUDA, &moe_sum); ++ ++ // Aligning the number of tokens to be processed by each expert such ++ // that it is divisible by the block size. ++ m.def( ++ "moe_align_block_size(Tensor topk_ids, int num_experts," ++ " int block_size, Tensor! sorted_token_ids," ++ " Tensor! experts_ids," ++ " Tensor! num_tokens_post_pad) -> ()"); ++ m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); ++ ++#ifndef USE_ROCM ++ m.def( ++ "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " ++ "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " ++ "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! 
workspace, " ++ "int b_q_type, SymInt size_m, " ++ "SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int " ++ "topk, " ++ "int moe_block_size, bool replicate_input, bool apply_weights)" ++ " -> Tensor"); ++ // conditionally compiled so impl registration is in source file ++#endif ++} ++ ++REGISTER_EXTENSION(TORCH_EXTENSION_NAME) +diff --git a/csrc/ops.h b/csrc/ops.h +index 9541adc..9efd9b0 100644 +--- a/csrc/ops.h ++++ b/csrc/ops.h +@@ -1,206 +1,245 @@ + #pragma once + +-#include ++#include ++#include ++ ++#include "core/scalar_type.hpp" ++ ++#include ++ ++torch::Tensor weak_ref_tensor(torch::Tensor& tensor) { ++ // Ensure tensor is on CUDA ++ if (!tensor.is_cuda()) { ++ throw std::runtime_error("Tensor must be on CUDA device"); ++ } ++ ++ // Get the raw data pointer ++ void* data_ptr = tensor.data_ptr(); ++ ++ // Get tensor sizes and strides ++ std::vector sizes = tensor.sizes().vec(); ++ std::vector strides = tensor.strides().vec(); ++ ++ // Get tensor options (dtype, device) ++ auto options = tensor.options(); ++ ++ // Create a new tensor from the raw data pointer ++ auto new_tensor = torch::from_blob(data_ptr, sizes, strides, options); ++ ++ return new_tensor; ++} + + void paged_attention_v1( +- torch::Tensor& out, +- torch::Tensor& query, +- torch::Tensor& key_cache, +- torch::Tensor& value_cache, +- int num_kv_heads, +- float scale, +- torch::Tensor& block_tables, +- torch::Tensor& seq_lens, +- int block_size, +- int max_seq_len, +- const c10::optional& alibi_slopes, +- const std::string& kv_cache_dtype, +- float kv_scale); ++ torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, ++ torch::Tensor& value_cache, int64_t num_kv_heads, double scale, ++ torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, ++ int64_t max_seq_len, const std::optional& alibi_slopes, ++ const std::string& kv_cache_dtype, double k_scale, double v_scale, ++ const int64_t tp_rank, const int64_t blocksparse_local_blocks, ++ const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, ++ const int64_t blocksparse_head_sliding_step); + + void paged_attention_v2( +- torch::Tensor& out, +- torch::Tensor& exp_sums, +- torch::Tensor& max_logits, +- torch::Tensor& tmp_out, +- torch::Tensor& query, +- torch::Tensor& key_cache, +- torch::Tensor& value_cache, +- int num_kv_heads, +- float scale, +- torch::Tensor& block_tables, +- torch::Tensor& seq_lens, +- int block_size, +- int max_seq_len, +- const c10::optional& alibi_slopes, +- const std::string& kv_cache_dtype, +- float kv_scale); +- +-void rms_norm( +- torch::Tensor& out, +- torch::Tensor& input, +- torch::Tensor& weight, +- float epsilon); +- +-void fused_add_rms_norm( +- torch::Tensor& input, +- torch::Tensor& residual, +- torch::Tensor& weight, +- float epsilon); +- +-void rotary_embedding( +- torch::Tensor& positions, +- torch::Tensor& query, +- torch::Tensor& key, +- int head_size, +- torch::Tensor& cos_sin_cache, +- bool is_neox); +- +-void batched_rotary_embedding( +- torch::Tensor& positions, +- torch::Tensor& query, +- torch::Tensor& key, +- int head_size, +- torch::Tensor& cos_sin_cache, +- bool is_neox, +- int rot_dim, +- torch::Tensor& cos_sin_cache_offsets); +- +-void silu_and_mul( +- torch::Tensor& out, +- torch::Tensor& input); +- +-void gelu_and_mul( +- torch::Tensor& out, +- torch::Tensor& input); +- +-void gelu_tanh_and_mul( +- torch::Tensor& out, +- torch::Tensor& input); +- +-void gelu_new( +- torch::Tensor& out, +- torch::Tensor& input); +- +-void gelu_fast( +- torch::Tensor& out, 
+- torch::Tensor& input); ++ torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, ++ torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, ++ torch::Tensor& value_cache, int64_t num_kv_heads, double scale, ++ torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, ++ int64_t max_seq_len, const std::optional& alibi_slopes, ++ const std::string& kv_cache_dtype, double k_scale, double v_scale, ++ const int64_t tp_rank, const int64_t blocksparse_local_blocks, ++ const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, ++ const int64_t blocksparse_head_sliding_step); ++ ++void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, ++ double epsilon); ++ ++void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, ++ torch::Tensor& weight, double epsilon); ++ ++void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, ++ torch::Tensor& weight, torch::Tensor& scale, ++ double epsilon); ++ ++void fused_add_rms_norm_static_fp8_quant(torch::Tensor& out, ++ torch::Tensor& input, ++ torch::Tensor& residual, ++ torch::Tensor& weight, ++ torch::Tensor& scale, double epsilon); ++ ++void rms_norm_dynamic_per_token_quant(torch::Tensor& out, ++ torch::Tensor const& input, ++ torch::Tensor const& weight, ++ torch::Tensor& scales, ++ double const epsilon, ++ std::optional scale_ub, ++ std::optional residual); ++ ++void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, ++ torch::Tensor& key, int64_t head_size, ++ torch::Tensor& cos_sin_cache, bool is_neox); ++ ++void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query, ++ torch::Tensor& key, int64_t head_size, ++ torch::Tensor& cos_sin_cache, bool is_neox, ++ int64_t rot_dim, ++ torch::Tensor& cos_sin_cache_offsets); ++ ++void silu_and_mul(torch::Tensor& out, torch::Tensor& input); ++ ++void gelu_and_mul(torch::Tensor& out, torch::Tensor& input); ++ ++void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); ++ ++void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input, ++ double threshold); ++ ++void gelu_new(torch::Tensor& out, torch::Tensor& input); ++ ++void gelu_fast(torch::Tensor& out, torch::Tensor& input); ++ ++void gelu_quick(torch::Tensor& out, torch::Tensor& input); ++ ++void advance_step_flashattn(int64_t num_seqs, int64_t num_queries, ++ int64_t block_size, torch::Tensor& input_tokens, ++ torch::Tensor& sampled_token_ids, ++ torch::Tensor& input_positions, ++ torch::Tensor& seq_lens, ++ torch::Tensor& slot_mapping, ++ torch::Tensor& block_tables); ++ ++void advance_step_flashinfer( ++ int64_t num_seqs, int64_t num_queries, int64_t block_size, ++ torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids, ++ torch::Tensor& input_positions, torch::Tensor& seq_lens, ++ torch::Tensor& slot_mapping, torch::Tensor& block_tables, ++ torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr, ++ torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds); + + #ifndef USE_ROCM +-torch::Tensor aqlm_gemm( +- const torch::Tensor& input, +- const torch::Tensor& codes, +- const torch::Tensor& codebooks, +- const torch::Tensor& scales, +- const torch::Tensor& codebook_partition_sizes, +- const std::optional& bias +-); ++torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, ++ const torch::Tensor& codebooks, ++ const torch::Tensor& scales, ++ const std::vector& codebook_partition_sizes, ++ const std::optional& bias); + + torch::Tensor aqlm_dequant( 
+- const torch::Tensor& codes, +- const torch::Tensor& codebooks, +- const torch::Tensor& codebook_partition_sizes +-); +- +-torch::Tensor awq_gemm( +- torch::Tensor _in_feats, +- torch::Tensor _kernel, +- torch::Tensor _scaling_factors, +- torch::Tensor _zeros, +- int split_k_iters); +- +-torch::Tensor awq_dequantize( +- torch::Tensor _kernel, +- torch::Tensor _scaling_factors, +- torch::Tensor _zeros, +- int split_k_iters, +- int thx, +- int thy); +- +-torch::Tensor marlin_gemm( +- torch::Tensor& a, +- torch::Tensor& b_q_weight, +- torch::Tensor& b_scales, +- torch::Tensor& workspace, +- int64_t size_m, +- int64_t size_n, +- int64_t size_k); +- +-torch::Tensor gptq_marlin_gemm( +- torch::Tensor &a, +- torch::Tensor &b_q_weight, +- torch::Tensor &b_scales, +- torch::Tensor &g_idx, +- torch::Tensor &perm, +- torch::Tensor &workspace, +- int64_t num_bits, +- int64_t size_m, +- int64_t size_n, +- int64_t size_k, +- bool is_k_full); +- +-torch::Tensor gptq_marlin_repack( +- torch::Tensor &b_q_weight, +- torch::Tensor &perm, +- int64_t size_k, +- int64_t size_n, +- int64_t num_bits); ++ const torch::Tensor& codes, const torch::Tensor& codebooks, ++ const std::vector& codebook_partition_sizes); ++ ++torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, ++ torch::Tensor _scaling_factors, torch::Tensor _zeros, ++ int64_t split_k_iters); ++ ++torch::Tensor awq_dequantize(torch::Tensor _kernel, ++ torch::Tensor _scaling_factors, ++ torch::Tensor _zeros, int64_t split_k_iters, ++ int64_t thx, int64_t thy); ++ ++torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm); ++#endif ++ ++torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m, ++ int64_t n); ++ ++torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, ++ int64_t type, int64_t row); ++ ++torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type, ++ int64_t row); ++ ++#ifndef USE_ROCM ++bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); ++ ++void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, ++ torch::Tensor const& b, torch::Tensor const& a_scales, ++ torch::Tensor const& b_scales, ++ std::optional const& bias); ++ ++void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, ++ torch::Tensor const& b, ++ torch::Tensor const& a_scales, ++ torch::Tensor const& b_scales, ++ torch::Tensor const& azp_adj, ++ std::optional const& azp, ++ std::optional const& bias); ++ ++bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability); ++ ++void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a, ++ torch::Tensor const& b, torch::Tensor const& e, ++ torch::Tensor const& a_scales, ++ torch::Tensor const& b_scales, ++ std::optional const& bias); ++ ++bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed, ++ torch::Tensor& e, torch::Tensor const& a); + #endif + +-void squeezellm_gemm( +- torch::Tensor vec, +- torch::Tensor mat, +- torch::Tensor mul, +- torch::Tensor lookup_table); +- +-torch::Tensor gptq_gemm( +- torch::Tensor a, +- torch::Tensor b_q_weight, +- torch::Tensor b_gptq_qzeros, +- torch::Tensor b_gptq_scales, +- torch::Tensor b_g_idx, +- bool use_exllama, +- int bit); +- +-void gptq_shuffle( +- torch::Tensor q_weight, +- torch::Tensor q_perm, +- int bit); +- +-void static_scaled_fp8_quant( +- torch::Tensor& out, +- torch::Tensor& input, +- torch::Tensor& scale); +- +-void dynamic_scaled_fp8_quant( +- torch::Tensor& out, +- torch::Tensor& input, +- torch::Tensor& scale); +- 
+-void moe_align_block_size( +- torch::Tensor topk_ids, +- int num_experts, +- int block_size, +- torch::Tensor sorted_token_ids, +- torch::Tensor experts_ids, +- torch::Tensor num_tokens_post_pad); ++void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, ++ torch::Tensor const& scale, ++ std::optional const& azp); ++ ++void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, ++ torch::Tensor& scales, ++ std::optional const& azp); ++ ++torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, ++ torch::Tensor b_gptq_qzeros, ++ torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, ++ bool use_exllama, int64_t bit); ++ ++void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit); ++ ++void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input, ++ torch::Tensor const& scale); ++ ++void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input, ++ torch::Tensor& scale); ++ ++void dynamic_per_token_scaled_fp8_quant( ++ torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale, ++ std::optional const& scale_ub); ++ ++void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta, ++ const torch::Tensor& A, const torch::Tensor& B, ++ const torch::Tensor& C, ++ const std::optional& D_, ++ const std::optional& z_, ++ const std::optional& delta_bias_, ++ bool delta_softplus, ++ const std::optional& query_start_loc, ++ const std::optional& cache_indices, ++ const std::optional& has_initial_state, ++ const torch::Tensor& ssm_states, int64_t pad_slot_id); ++ ++void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state, ++ const at::Tensor& weight, ++ const std::optional& bias_, ++ bool silu_activation, ++ const std::optional& cache_seqlens_, ++ const std::optional& conv_state_indices_, ++ int64_t pad_slot_id); ++ ++void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight, ++ const std::optional& bias_, ++ const std::optional& conv_states, ++ const std::optional& query_start_loc, ++ const std::optional& cache_indices, ++ const std::optional& has_initial_state, ++ bool silu_activation, int64_t pad_slot_id); + + #ifndef USE_ROCM +-using fptr_t = uint64_t; +-fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, +- const std::vector &handles, +- const std::vector &offsets, int rank, +- bool full_nvlink); +-bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size, +- bool full_nvlink); +-void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out); +-void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor ®_buffer, +- torch::Tensor &out); ++using fptr_t = int64_t; ++fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, ++ torch::Tensor& rank_data, int64_t rank, bool full_nvlink); ++void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, ++ fptr_t reg_buffer, int64_t reg_buffer_sz_bytes); + void dispose(fptr_t _fa); +-int meta_size(); +-void register_buffer(fptr_t _fa, torch::Tensor &t, +- const std::vector &handles, +- const std::vector &offsets); +-std::pair, std::vector> get_graph_buffer_ipc_meta(fptr_t _fa); +-void register_graph_buffers(fptr_t _fa, const std::vector &handles, +- const std::vector> &offsets); ++int64_t meta_size(); ++void register_buffer(fptr_t _fa, const std::vector& fake_ipc_ptrs); ++std::tuple, std::vector> ++get_graph_buffer_ipc_meta(fptr_t _fa); ++void register_graph_buffers(fptr_t _fa, ++ const std::vector>& handles, ++ const std::vector>& offsets); + #endif +diff 
--git a/csrc/permute_cols.cu b/csrc/permute_cols.cu +new file mode 100644 +index 0000000..f51fa73 +--- /dev/null ++++ b/csrc/permute_cols.cu +@@ -0,0 +1,88 @@ ++#include ++ ++#include ++#include ++ ++#include ++ ++static constexpr int default_threads = 256; ++static constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } ++ ++// For a given "a" of size [M,K] performs a permutation of the K columns based ++// on the given "perm" indices. ++// Currently only supports 16bit types (since we permute half types) ++__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, ++ int const* __restrict__ perm_int_ptr, ++ int4* __restrict__ out_int4_ptr, int size_m, ++ int size_k, int block_rows) { ++ int start_row = block_rows * blockIdx.x; ++ int finish_row = start_row + block_rows; ++ if (finish_row > size_m) { ++ finish_row = size_m; ++ } ++ int cur_block_rows = std::max(finish_row - start_row, 0); ++ ++ int row_stride = size_k * sizeof(half) / 16; ++ ++ auto permute_row = [&](int row) { ++ int iters = size_k / default_threads; ++ int rest = size_k % default_threads; ++ ++ int offset = row * row_stride; ++ ++ half const* a_row_half = reinterpret_cast(a_int4_ptr + offset); ++ half* out_half = reinterpret_cast(out_int4_ptr + offset); ++ ++ int base_k = 0; ++ ++ for (int i = 0; i < iters; i++) { ++ int cur_k = base_k + threadIdx.x; ++ int src_pos = perm_int_ptr[cur_k]; ++ ++ out_half[cur_k] = a_row_half[src_pos]; ++ ++ base_k += default_threads; ++ } ++ ++ if (rest) { ++ if (threadIdx.x < rest) { ++ int cur_k = base_k + threadIdx.x; ++ int src_pos = perm_int_ptr[cur_k]; ++ ++ out_half[cur_k] = a_row_half[src_pos]; ++ } ++ } ++ }; ++ ++ for (int i = 0; i < cur_block_rows; i++) { ++ int cur_row = start_row + i; ++ if (cur_row < size_m) { ++ permute_row(cur_row); ++ } ++ } ++} ++ ++// More efficient version of A[..., perm] ++// taken from gptq_marlin.cu ++torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm) { ++ const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); ++ auto dev = A.get_device(); ++ auto stream = at::cuda::getCurrentCUDAStream(dev); ++ ++ TORCH_CHECK(A.scalar_type() == at::kHalf || A.scalar_type() == at::kBFloat16, ++ "Currently only 16bit types are supported"); ++ TORCH_CHECK(A.is_contiguous(), "A must be contiguous"); ++ TORCH_CHECK(A.size(-1) % 8 == 0, ++ "A columns must be a multiple of 8 (128bits)"); ++ auto A_2d = A.view({-1, A.size(-1)}); ++ ++ torch::Tensor D = torch::empty_like(A); ++ int sms; ++ cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); ++ int block_rows = div_ceil(A_2d.size(0), sms); ++ permute_cols_kernel<<>>( ++ reinterpret_cast(A_2d.const_data_ptr()), ++ perm.const_data_ptr(), reinterpret_cast(D.mutable_data_ptr()), ++ A_2d.size(0), A_2d.size(1), block_rows); ++ return D; ++} +\ No newline at end of file +diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu +index d80cb69..97184a8 100644 +--- a/csrc/pos_encoding_kernels.cu ++++ b/csrc/pos_encoding_kernels.cu +@@ -1,4 +1,4 @@ +-#include ++#include + #include + #include + +@@ -7,14 +7,10 @@ + + namespace vllm { + +-template ++template + inline __device__ void apply_token_rotary_embedding( +- scalar_t* __restrict__ arr, +- const scalar_t* __restrict__ cos_ptr, +- const scalar_t* __restrict__ sin_ptr, +- int rot_offset, +- int embed_dim) +-{ ++ scalar_t* __restrict__ arr, const scalar_t* __restrict__ cos_ptr, ++ const scalar_t* __restrict__ sin_ptr, int rot_offset, int embed_dim) { + int x_index, y_index; + scalar_t cos, sin; + if 
(IS_NEOX) { +@@ -37,19 +33,17 @@ inline __device__ void apply_token_rotary_embedding( + arr[y_index] = y * cos + x * sin; + } + +-template ++template + inline __device__ void apply_rotary_embedding( +- scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] +- scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] +- const scalar_t* cache_ptr, +- const int head_size, +- const int num_heads, +- const int num_kv_heads, +- const int rot_dim, +- const int token_idx, +- const int64_t query_stride, +- const int64_t key_stride) +-{ ++ scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, ++ // head_size] or [num_tokens, num_heads, ++ // head_size] ++ scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, ++ // head_size] or [num_tokens, num_kv_heads, ++ // head_size] ++ const scalar_t* cache_ptr, const int head_size, const int num_heads, ++ const int num_kv_heads, const int rot_dim, const int token_idx, ++ const int64_t query_stride, const int64_t key_stride) { + const int embed_dim = rot_dim / 2; + const scalar_t* cos_ptr = cache_ptr; + const scalar_t* sin_ptr = cache_ptr + embed_dim; +@@ -59,8 +53,8 @@ inline __device__ void apply_rotary_embedding( + const int head_idx = i / embed_dim; + const int64_t token_head = token_idx * query_stride + head_idx * head_size; + const int rot_offset = i % embed_dim; +- apply_token_rotary_embedding(query + token_head, cos_ptr, +- sin_ptr, rot_offset, embed_dim); ++ apply_token_rotary_embedding( ++ query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); + } + + const int nk = num_kv_heads * embed_dim; +@@ -68,62 +62,74 @@ inline __device__ void apply_rotary_embedding( + const int head_idx = i / embed_dim; + const int64_t token_head = token_idx * key_stride + head_idx * head_size; + const int rot_offset = i % embed_dim; +- apply_token_rotary_embedding(key + token_head, cos_ptr, +- sin_ptr, rot_offset, embed_dim); ++ apply_token_rotary_embedding( ++ key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); + } + } + +-template ++template + __global__ void rotary_embedding_kernel( +- const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] +- scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] +- scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] +- const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] +- const int rot_dim, +- const int64_t query_stride, +- const int64_t key_stride, +- const int num_heads, +- const int num_kv_heads, +- const int head_size) { ++ const int64_t* __restrict__ positions, // [batch_size, seq_len] or ++ // [num_tokens] ++ scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, ++ // head_size] or [num_tokens, num_heads, ++ // head_size] ++ scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, ++ // head_size] or [num_tokens, num_kv_heads, ++ // head_size] ++ const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // ++ // 2] ++ const int rot_dim, const int64_t query_stride, const int64_t key_stride, ++ const int num_heads, const int num_kv_heads, const int head_size) { + // Each thread block is responsible for one token. 
+ const int token_idx = blockIdx.x; + int64_t pos = positions[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; + +- apply_rotary_embedding(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride); ++ apply_rotary_embedding( ++ query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, ++ token_idx, query_stride, key_stride); + } + +-template ++template + __global__ void batched_rotary_embedding_kernel( +- const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] +- scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] +- scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] +- const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] +- const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] or [num_tokens] +- const int rot_dim, +- const int64_t query_stride, +- const int64_t key_stride, +- const int num_heads, +- const int num_kv_heads, +- const int head_size) { ++ const int64_t* __restrict__ positions, // [batch_size, seq_len] or ++ // [num_tokens] ++ scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, ++ // head_size] or [num_tokens, num_heads, ++ // head_size] ++ scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, ++ // head_size] or [num_tokens, num_kv_heads, ++ // head_size] ++ const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // ++ // 2] ++ const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] ++ // or [num_tokens] ++ const int rot_dim, const int64_t query_stride, const int64_t key_stride, ++ const int num_heads, const int num_kv_heads, const int head_size) { + // Each thread block is responsible for one token. 
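// Minimal CPU sketch of the rotation applied per (token, head) by the
// rotary_embedding kernels above. The pairing of elements for the two styles
// (NEOX: element i with i + embed_dim; otherwise adjacent elements 2i and
// 2i + 1) follows the usual convention and is stated here as an assumption,
// since the IS_NEOX branch body is not visible in this hunk.
void ref_apply_rotary(float* head, const float* cos_tab, const float* sin_tab,
                      int rot_dim, bool is_neox) {
  const int embed_dim = rot_dim / 2;
  for (int i = 0; i < embed_dim; ++i) {
    const int xi = is_neox ? i : 2 * i;
    const int yi = is_neox ? i + embed_dim : 2 * i + 1;
    const float c = cos_tab[i], s = sin_tab[i];
    const float x = head[xi], y = head[yi];
    head[xi] = x * c - y * s;  // counterpart update (assumed)
    head[yi] = y * c + x * s;  // matches arr[y_index] shown in the hunk
  }
}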
+ const int token_idx = blockIdx.x; + int64_t pos = positions[token_idx]; + int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx]; +- const scalar_t* cache_ptr = cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim; ++ const scalar_t* cache_ptr = ++ cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim; + +- apply_rotary_embedding(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride); ++ apply_rotary_embedding( ++ query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, ++ token_idx, query_stride, key_stride); + } + +-} // namespace vllm ++} // namespace vllm + + void rotary_embedding( +- torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] +- torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size] +- torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size] +- int head_size, +- torch::Tensor& cos_sin_cache, // [max_position, rot_dim] +- bool is_neox) { ++ torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] ++ torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or ++ // [num_tokens, num_heads * head_size] ++ torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or ++ // [num_tokens, num_kv_heads * head_size] ++ int64_t head_size, ++ torch::Tensor& cos_sin_cache, // [max_position, rot_dim] ++ bool is_neox) { + int64_t num_tokens = query.numel() / query.size(-1); + int rot_dim = cos_sin_cache.size(1); + int num_heads = query.size(-1) / head_size; +@@ -132,39 +138,24 @@ void rotary_embedding( + int64_t key_stride = key.stride(-2); + + dim3 grid(num_tokens); +- dim3 block(std::min(num_heads * rot_dim / 2, 512)); ++ dim3 block(std::min(num_heads * rot_dim / 2, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); +- VLLM_DISPATCH_FLOATING_TYPES( +- query.scalar_type(), +- "rotary_embedding", +- [&] { +- if (is_neox) { +- vllm::rotary_embedding_kernel<<>>( +- positions.data_ptr(), +- query.data_ptr(), +- key.data_ptr(), +- cos_sin_cache.data_ptr(), +- rot_dim, +- query_stride, +- key_stride, +- num_heads, +- num_kv_heads, +- head_size); +- } else { +- vllm::rotary_embedding_kernel<<>>( +- positions.data_ptr(), +- query.data_ptr(), +- key.data_ptr(), +- cos_sin_cache.data_ptr(), +- rot_dim, +- query_stride, +- key_stride, +- num_heads, +- num_kv_heads, +- head_size); +- } +- }); ++ VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] { ++ if (is_neox) { ++ vllm::rotary_embedding_kernel<<>>( ++ positions.data_ptr(), query.data_ptr(), ++ key.data_ptr(), cos_sin_cache.data_ptr(), rot_dim, ++ query_stride, key_stride, num_heads, num_kv_heads, head_size); ++ } else { ++ vllm::rotary_embedding_kernel ++ <<>>( ++ positions.data_ptr(), query.data_ptr(), ++ key.data_ptr(), cos_sin_cache.data_ptr(), ++ rot_dim, query_stride, key_stride, num_heads, num_kv_heads, ++ head_size); ++ } ++ }); + } + + /* +@@ -172,14 +163,15 @@ Batched version of rotary embedding, pack multiple LoRAs together + and process in batched manner. 
+ */ + void batched_rotary_embedding( +- torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] +- torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size] +- torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size] +- int head_size, +- torch::Tensor& cos_sin_cache, // [max_position, rot_dim] +- bool is_neox, +- int rot_dim, +- torch::Tensor& cos_sin_cache_offsets // [num_tokens] ++ torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] ++ torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or ++ // [num_tokens, num_heads * head_size] ++ torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or ++ // [num_tokens, num_kv_heads * head_size] ++ int64_t head_size, ++ torch::Tensor& cos_sin_cache, // [max_position, rot_dim] ++ bool is_neox, int64_t rot_dim, ++ torch::Tensor& cos_sin_cache_offsets // [num_tokens] + ) { + int64_t num_tokens = cos_sin_cache_offsets.size(0); + int num_heads = query.size(-1) / head_size; +@@ -188,39 +180,24 @@ void batched_rotary_embedding( + int64_t key_stride = key.stride(-2); + + dim3 grid(num_tokens); +- dim3 block(std::min(num_heads * rot_dim / 2, 512)); ++ dim3 block(std::min(num_heads * rot_dim / 2, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); +- VLLM_DISPATCH_FLOATING_TYPES( +- query.scalar_type(), +- "rotary_embedding", +- [&] { +- if (is_neox) { +- vllm::batched_rotary_embedding_kernel<<>>( +- positions.data_ptr(), +- query.data_ptr(), +- key.data_ptr(), +- cos_sin_cache.data_ptr(), +- cos_sin_cache_offsets.data_ptr(), +- rot_dim, +- query_stride, +- key_stride, +- num_heads, +- num_kv_heads, +- head_size); +- } else { +- vllm::batched_rotary_embedding_kernel<<>>( +- positions.data_ptr(), +- query.data_ptr(), +- key.data_ptr(), +- cos_sin_cache.data_ptr(), +- cos_sin_cache_offsets.data_ptr(), +- rot_dim, +- query_stride, +- key_stride, +- num_heads, +- num_kv_heads, +- head_size); +- } +- }); ++ VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] { ++ if (is_neox) { ++ vllm::batched_rotary_embedding_kernel ++ <<>>( ++ positions.data_ptr(), query.data_ptr(), ++ key.data_ptr(), cos_sin_cache.data_ptr(), ++ cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, ++ key_stride, num_heads, num_kv_heads, head_size); ++ } else { ++ vllm::batched_rotary_embedding_kernel ++ <<>>( ++ positions.data_ptr(), query.data_ptr(), ++ key.data_ptr(), cos_sin_cache.data_ptr(), ++ cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, ++ key_stride, num_heads, num_kv_heads, head_size); ++ } ++ }); + } +diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu +new file mode 100644 +index 0000000..bd184ee +--- /dev/null ++++ b/csrc/prepare_inputs/advance_step.cu +@@ -0,0 +1,327 @@ ++/* ++ * The goal of this GPU kernel is to advance input tensors on the GPU directly ++ * PR: https://github.com/vllm-project/vllm/pull/6338 ++ * Current restrictions: ++ * 1. Specialized for DraftModelRunner ++ * 2. 
Supports flash_attn only ++ */ ++ ++#include "advance_step.cuh" ++ ++namespace prepare_inputs { ++ ++// ++template ++__global__ void advance_step_flashattn_kernel( ++ int num_seqs, int num_queries, int block_size, long* input_tokens_ptr, ++ long const* sampled_token_ids_ptr, long* input_positions_ptr, ++ int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr, ++ int64_t const block_tables_stride) { ++ int const n_pad = num_seqs - num_queries; ++ if (n_pad && blockIdx.x == 0) { ++ // Handle cuda graph padding ++ int const offset = num_queries; ++ for (int i = threadIdx.x; i < n_pad; i += blockDim.x) { ++ input_tokens_ptr[offset + i] = 0; ++ input_positions_ptr[offset + i] = 0; ++ slot_mapping_ptr[offset + i] = -1; ++ } ++ } ++ ++ int num_query_blocks = div_ceil(num_queries, num_threads); ++ ++ if (blockIdx.x >= num_query_blocks) { ++ return; ++ } ++ ++ int cur_query_id = blockIdx.x * num_threads + threadIdx.x; ++ ++ if (cur_query_id >= num_queries) { ++ return; ++ } ++ ++ // Update input_tokens ++ input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id]; ++ ++ int seq_len = seq_lens_ptr[cur_query_id]; ++ int next_seq_len = seq_len + 1; ++ int next_input_pos = next_seq_len - 1; ++ ++ // Update seq_lens ++ seq_lens_ptr[cur_query_id] = next_seq_len; ++ // Update input_positions ++ input_positions_ptr[cur_query_id] = next_input_pos; ++ ++ int const* seq_block_tables_ptr = ++ block_tables_ptr + block_tables_stride * cur_query_id; ++ ++ int block_index = next_input_pos / block_size; ++ int block_offset = next_input_pos % block_size; ++ ++ int slot_num = seq_block_tables_ptr[block_index] * block_size + block_offset; ++ // Update slot_mapping ++ slot_mapping_ptr[cur_query_id] = slot_num; ++} ++ ++inline void verify_tensor(std::string const& name, torch::Tensor const& t, ++ int64_t const size_0, int64_t const size_1, ++ c10::ScalarType const type) { ++ bool size_0_cond = true; ++ if (size_0 != -1) { ++ size_0_cond = t.size(0) == size_0; ++ } ++ ++ bool size_1_cond = true; ++ if (size_1 != -1) { ++ size_1_cond = t.size(1) == size_1; ++ } ++ ++ bool is_contiguous = t.is_contiguous(); ++ bool same_type = t.dtype() == type; ++ ++ bool pass = size_0_cond && size_1_cond && is_contiguous && same_type; ++ if (!pass) { ++ TORCH_CHECK(false, "tensor: name = ", name, ", shape = ", t.sizes(), ++ " is_cont = ", t.is_contiguous(), ", type = ", t.dtype(), ++ " is not as expected: shape = [", size_0, ", ", size_1, ++ "], type = ", type); ++ } ++} ++ ++/// each thread processes a block per query ++__global__ void advance_step_flashinfer_kernel( ++ int num_threads, int num_seqs, int num_queries, int block_size, ++ long* input_tokens_ptr, long const* sampled_token_ids_ptr, ++ long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr, ++ int const* block_tables_ptr, int64_t const block_tables_stride, ++ int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) { ++ int num_query_blocks = div_ceil(num_queries, num_threads); ++ ++ if (blockIdx.x < num_query_blocks) { ++ int cur_query_id = blockIdx.x * num_threads + threadIdx.x; ++ ++ if (cur_query_id < num_queries) { ++ // Update input_tokens ++ input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id]; ++ ++ int seq_len = seq_lens_ptr[cur_query_id]; ++ int next_seq_len = seq_len + 1; ++ int next_input_pos = next_seq_len - 1; ++ ++ // Update seq_lens ++ seq_lens_ptr[cur_query_id] = next_seq_len; ++ // Update input_positions ++ input_positions_ptr[cur_query_id] = next_input_pos; ++ ++ int const* seq_block_tables_ptr = 
++ block_tables_ptr + block_tables_stride * cur_query_id; ++ ++ int block_index = next_input_pos / block_size; ++ int block_offset = next_input_pos % block_size; ++ ++ // Update paged_kv_last_page_len ++ paged_kv_last_page_len_ptr[cur_query_id] = block_offset + 1; ++ ++ int slot_num = ++ seq_block_tables_ptr[block_index] * block_size + block_offset; ++ // Update slot_mapping ++ slot_mapping_ptr[cur_query_id] = slot_num; ++ block_table_bound_ptr[cur_query_id] = div_ceil(next_seq_len, block_size); ++ } ++ } ++} ++ ++__global__ void advance_step_flashinfer_indptr_kernel( ++ int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr, ++ int* block_table_bound_ptr) { ++ int idx = blockIdx.x * num_threads + threadIdx.x; ++ // Update paged_kv_indptr ++ if (idx == 0) { ++ paged_kv_indptr_ptr[idx] = 0; ++ } ++ if (idx < num_queries) { ++ int sum = 0; ++ for (int i = 0; i <= idx; ++i) { ++ sum += block_table_bound_ptr[i]; ++ } ++ paged_kv_indptr_ptr[idx + 1] = sum; ++ } ++} ++ ++__global__ void advance_step_flashinfer_indices_kernel( ++ int num_seqs, int num_queries, int const* block_tables_ptr, ++ int64_t const max_num_blocks_per_seq, int* paged_kv_indices_ptr, ++ int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { ++ // note: max_num_blocks_per_seq = block_tables.stride(0) ++ int tid = blockIdx.x * blockDim.x + threadIdx.x; ++ ++ // when cuda graphs are enabled, paged_kv_indptr tensor ++ // has to be updated for the padded queries ++ // tid represents a query# for paged_kv_indptr tensor ++ if (num_queries < tid && tid <= num_seqs) { ++ paged_kv_indptr_ptr[tid] = paged_kv_indptr_ptr[num_queries]; ++ } ++ ++ // each thread processes a block_ptr in block_tables ++ // block_tables shape: [num_queries, max_num_blocks_per_seq] ++ // paged_kv_indices is flattened block_tables. 
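// Host-side sketch of the per-query bookkeeping performed by the advance_step
// kernels above, with made-up numbers, to spell out the slot-mapping and
// paging arithmetic. The real kernels update the GPU tensors in place.
#include <cstdio>

int main() {
  const int block_size = 16;
  const int seq_len = 45;               // length before appending the new token
  const int block_table[] = {7, 3, 9};  // physical block ids for this sequence

  const int next_seq_len = seq_len + 1;         // 46
  const int next_input_pos = next_seq_len - 1;  // 45: position of the new token
  const int block_index = next_input_pos / block_size;   // 2
  const int block_offset = next_input_pos % block_size;  // 13
  const int slot = block_table[block_index] * block_size + block_offset;  // 9*16+13 = 157

  // flashinfer extras: last-page length and per-query block count, whose
  // prefix sum is what advance_step_flashinfer_indptr_kernel above writes
  // into paged_kv_indptr.
  const int last_page_len = block_offset + 1;                            // 14
  const int block_bound = (next_seq_len + block_size - 1) / block_size;  // ceil(46/16) = 3
  std::printf("slot=%d last_page_len=%d blocks=%d\n", slot, last_page_len, block_bound);
  return 0;
}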
++ for (int idx = tid; idx < (num_seqs * max_num_blocks_per_seq); ++ idx += (gridDim.x * blockDim.x)) { ++ // block_tables-row = paged_kv_indptr[queryNum] ++ int queryNum = idx / max_num_blocks_per_seq; ++ int col = idx % max_num_blocks_per_seq; ++ if (queryNum < num_queries && col < block_table_bound_ptr[queryNum]) { ++ int indices_arr_idx = paged_kv_indptr_ptr[queryNum] + col; ++ int block_tables_idx = queryNum * max_num_blocks_per_seq + col; ++ paged_kv_indices_ptr[indices_arr_idx] = ++ block_tables_ptr[block_tables_idx]; ++ } ++ } ++} ++ ++void advance_step_flashattn(int num_seqs, int num_queries, int block_size, ++ torch::Tensor& input_tokens, // type: long ++ torch::Tensor& sampled_token_ids, // type: long ++ torch::Tensor& input_positions, // type: long ++ torch::Tensor& seq_lens, // type: int ++ torch::Tensor& slot_mapping, // type: long ++ torch::Tensor& block_tables) { // type: int ++ ++ if (logging) { ++ printf("advance_step_flashattn:\n"); ++ printf(" num_seqs = %d\n", num_seqs); ++ printf(" num_queries = %d\n", num_queries); ++ printf(" block_size = %d\n", block_size); ++ } ++ // Verify all tensors ++ verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong); ++ verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1, ++ at::kLong); ++ verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong); ++ verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt); ++ verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong); ++ verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt); ++ ++ int dev = sampled_token_ids.get_device(); ++ cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev); ++ ++ int blocks; ++ cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); ++ ++ advance_step_flashattn_kernel ++ <<>>( ++ num_seqs, num_queries, block_size, ++ reinterpret_cast(input_tokens.data_ptr()), ++ reinterpret_cast(sampled_token_ids.data_ptr()), ++ reinterpret_cast(input_positions.data_ptr()), ++ reinterpret_cast(seq_lens.data_ptr()), ++ reinterpret_cast(slot_mapping.data_ptr()), ++ reinterpret_cast(block_tables.data_ptr()), ++ block_tables.stride(0)); ++} ++ ++void advance_step_flashinfer( ++ int num_seqs, int num_queries, int block_size, ++ torch::Tensor& input_tokens, // type: long ++ torch::Tensor& sampled_token_ids, // type: long ++ torch::Tensor& input_positions, // type: long ++ torch::Tensor& seq_lens, // type: int ++ torch::Tensor& slot_mapping, // type: long ++ torch::Tensor& block_tables, // type: int ++ torch::Tensor& paged_kv_indices, // type: int ++ torch::Tensor& paged_kv_indptr, // type: int ++ torch::Tensor& paged_kv_last_page_len, // type: int ++ torch::Tensor& block_table_bound) { // type: int ++ ++ if (logging) { ++ printf("advance_step_flashinfer:\n"); ++ printf(" num_seqs = %d\n", num_seqs); ++ printf(" num_queries = %d\n", num_queries); ++ printf(" block_size = %d\n", block_size); ++ printf(" block_tables.stride(0) = %zu\n", block_tables.stride(0)); ++ } ++ // Verify all tensors ++ verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong); ++ // verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1, ++ // at::kLong); ++ verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong); ++ verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt); ++ verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong); ++ verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt); ++ ++ verify_tensor("paged_kv_indices", paged_kv_indices, -1, 
-1, at::kInt); ++ verify_tensor("paged_kv_indptr", paged_kv_indptr, num_seqs + 1, -1, at::kInt); ++ verify_tensor("paged_kv_last_page_len", paged_kv_last_page_len, num_seqs, -1, ++ at::kInt); ++ ++ verify_tensor("block_table_bound", block_table_bound, num_seqs, -1, at::kInt); ++ ++ int dev = sampled_token_ids.get_device(); ++ cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev); ++ ++ int blocks; ++ int threads; ++ cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); ++ cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); ++ ++ int block_tables_stride = block_tables.stride(0); ++ TORCH_CHECK((blocks * threads > num_queries), ++ "multi-step: not enough threads to map to num_queries = ", ++ num_queries, " block_tables.stride(0) = ", block_tables.stride(0), ++ " blocks = ", blocks, " max_threads = ", threads); ++ if (logging) { ++ printf("launching kernels with %d blocks and %d threads\n", blocks, ++ threads); ++ } ++ advance_step_flashinfer_kernel<<>>( ++ threads, num_seqs, num_queries, block_size, ++ reinterpret_cast(input_tokens.data_ptr()), ++ reinterpret_cast(sampled_token_ids.data_ptr()), ++ reinterpret_cast(input_positions.data_ptr()), ++ reinterpret_cast(seq_lens.data_ptr()), ++ reinterpret_cast(slot_mapping.data_ptr()), ++ reinterpret_cast(block_tables.data_ptr()), ++ block_tables.stride(0), ++ reinterpret_cast(paged_kv_last_page_len.data_ptr()), ++ reinterpret_cast(block_table_bound.data_ptr())); ++ ++ advance_step_flashinfer_indptr_kernel<<>>( ++ threads, num_seqs, num_queries, ++ reinterpret_cast(paged_kv_indptr.data_ptr()), ++ reinterpret_cast(block_table_bound.data_ptr())); ++ ++ advance_step_flashinfer_indices_kernel<<>>( ++ num_seqs, num_queries, ++ reinterpret_cast(block_tables.data_ptr()), ++ block_tables.stride(0), ++ reinterpret_cast(paged_kv_indices.data_ptr()), ++ reinterpret_cast(paged_kv_indptr.data_ptr()), ++ reinterpret_cast(block_table_bound.data_ptr())); ++} ++ ++} // namespace prepare_inputs ++ ++void advance_step_flashattn(int64_t num_seqs, int64_t num_queries, ++ int64_t block_size, torch::Tensor& input_tokens, ++ torch::Tensor& sampled_token_ids, ++ torch::Tensor& input_positions, ++ torch::Tensor& seq_lens, ++ torch::Tensor& slot_mapping, ++ torch::Tensor& block_tables) { ++ prepare_inputs::advance_step_flashattn( ++ num_seqs, num_queries, block_size, input_tokens, sampled_token_ids, ++ input_positions, seq_lens, slot_mapping, block_tables); ++} ++ ++void advance_step_flashinfer( ++ int64_t num_seqs, int64_t num_queries, int64_t block_size, ++ torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids, ++ torch::Tensor& input_positions, torch::Tensor& seq_lens, ++ torch::Tensor& slot_mapping, torch::Tensor& block_tables, ++ torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr, ++ torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bound) { ++ prepare_inputs::advance_step_flashinfer( ++ num_seqs, num_queries, block_size, input_tokens, sampled_token_ids, ++ input_positions, seq_lens, slot_mapping, block_tables, paged_kv_indices, ++ paged_kv_indptr, paged_kv_last_page_len, block_table_bound); ++} +diff --git a/csrc/prepare_inputs/advance_step.cuh b/csrc/prepare_inputs/advance_step.cuh +new file mode 100644 +index 0000000..f215746 +--- /dev/null ++++ b/csrc/prepare_inputs/advance_step.cuh +@@ -0,0 +1,19 @@ ++#pragma once ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++namespace prepare_inputs { ++ ++static constexpr int max_threads = 256; ++static constexpr 
bool logging = false; ++ ++constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } ++ ++} // namespace prepare_inputs +diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu +index 4415316..79cd2c6 100644 +--- a/csrc/quantization/aqlm/gemm_kernels.cu ++++ b/csrc/quantization/aqlm/gemm_kernels.cu +@@ -18,39 +18,35 @@ + #include + #include + #include +-#include ++#include + #include + #include + + #include + #include + +- + namespace vllm { + namespace aqlm { + + __global__ void Code1x16MatVec( +- const int4* __restrict__ A, +- const int4* __restrict__ B, +- int4* __restrict__ C, +- const int4* __restrict__ codebook, +- const int prob_m, +- const int prob_k, +- const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. +- const int codebook_stride // as int4. ++ const int4* __restrict__ A, const int4* __restrict__ B, ++ int4* __restrict__ C, const int4* __restrict__ codebook, const int prob_m, ++ const int prob_k, ++ const int4 codebook_a_sizes, // cumulative sizes of A spanning each ++ // codebook, at most 3 long. ++ const int codebook_stride // as int4. + ) { + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; + +- if (pred) +- { +- // advance to the correct codebook, this easy because we only multiply one column of the codebook. ++ if (pred) { ++ // advance to the correct codebook, this easy because we only multiply one ++ // column of the codebook. + auto codebook_size = &codebook_a_sizes.x; +- while (a_gl_rd >= *codebook_size) +- { +- codebook += codebook_stride; +- ++codebook_size; ++ while (a_gl_rd >= *codebook_size) { ++ codebook += codebook_stride; ++ ++codebook_size; + } + } + +@@ -67,8 +63,7 @@ __global__ void Code1x16MatVec( + // We pad shared memory to avoid bank conflicts during reads + __syncthreads(); + for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { +- if (b_gl_rd + i < prob_k / 8) +- sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; ++ if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; + } + __syncthreads(); + b_gl_rd += 32 * 8; +@@ -76,22 +71,19 @@ __global__ void Code1x16MatVec( + int b_sh_rd = 9 * (threadIdx.x % 32); + if (pred && a_gl_rd < a_gl_end) { + const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); +- #pragma unroll ++#pragma unroll + for (int i = 0; i < 8; i++) { + uint32_t dec[4]; +- // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't +- // actually help us; this brings > 2x speedup. +- asm volatile ( +- "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" +- : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) +- : "l"((void*) &codebook[enc[i]]) +- ); ++ // We bypass the L1 cache to avoid massive amounts of memory streaming ++ // that doesn't actually help us; this brings > 2x speedup. 
++ asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" ++ : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) ++ : "l"((void*)&codebook[enc[i]])); + half2* a = reinterpret_cast(&dec); + half2* b = reinterpret_cast(&sh_b[b_sh_rd]); + half2 res2 = {}; +- #pragma unroll +- for (int j = 0; j < 4; j++) +- res2 = __hfma2(a[j], b[j], res2); ++#pragma unroll ++ for (int j = 0; j < 4; j++) res2 = __hfma2(a[j], b[j], res2); + res += __half2float(res2.x) + __half2float(res2.y); + b_sh_rd++; + } +@@ -100,37 +92,33 @@ __global__ void Code1x16MatVec( + } + + if (pred) { +- #pragma unroll +- for (int i = 16; i > 0; i /= 2) +- res += __shfl_down_sync(0xffffffff, res, i); ++#pragma unroll ++ for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); + if (threadIdx.x % 32 == 0) + reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); + } + } + + __global__ void Code2x8MatVec( +- const int4* __restrict__ A, +- const int4* __restrict__ B, +- int4* __restrict__ C, +- const int4* __restrict__ codebook, +- int prob_m, +- int prob_k, +- const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. +- const int codebook_stride // as int4. ++ const int4* __restrict__ A, const int4* __restrict__ B, ++ int4* __restrict__ C, const int4* __restrict__ codebook, int prob_m, ++ int prob_k, ++ const int4 codebook_a_sizes, // cumulative sizes of A spanning each ++ // codebook, at most 3 long. ++ const int codebook_stride // as int4. + + ) { + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; + +- if (pred) +- { +- // advance to the correct codebook, this easy because we only multiply one column of the codebook. ++ if (pred) { ++ // advance to the correct codebook, this easy because we only multiply one ++ // column of the codebook. 
+ auto codebook_size = &codebook_a_sizes.x; +- while (a_gl_rd >= *codebook_size) +- { +- codebook += codebook_stride; +- ++codebook_size; ++ while (a_gl_rd >= *codebook_size) { ++ codebook += codebook_stride; ++ ++codebook_size; + } + } + +@@ -148,9 +136,8 @@ __global__ void Code2x8MatVec( + + for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { + int4 dec = codebook[i]; +- #pragma unroll +- for (int j = 0; j < 8; j++) +- sh_code[8 * i + (j + lane) % 8] = dec; ++#pragma unroll ++ for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; + } + __syncthreads(); + +@@ -161,8 +148,7 @@ __global__ void Code2x8MatVec( + // We pad shared memory to avoid bank conflicts during reads + __syncthreads(); + for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { +- if (b_gl_rd + i < prob_k / 8) +- sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; ++ if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; + } + __syncthreads(); + b_gl_rd += 32 * 8; +@@ -170,13 +156,15 @@ __global__ void Code2x8MatVec( + int b_sh_rd = 9 * (threadIdx.x % 32); + if (pred && a_gl_rd < a_gl_end) { + const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); +- #pragma unroll ++#pragma unroll + for (int i = 0; i < 8; i++) { +- half2* a0 = reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); +- half2* a1 = reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); +- half2* b = reinterpret_cast(&sh_b[b_sh_rd]); ++ half2* a0 = ++ reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); ++ half2* a1 = ++ reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); ++ half2* b = reinterpret_cast(&sh_b[b_sh_rd]); + half2 res2 = {}; +- #pragma unroll ++#pragma unroll + for (int j = 0; j < 4; j++) + res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2); + res += __half2float(res2.x) + __half2float(res2.y); +@@ -187,36 +175,31 @@ __global__ void Code2x8MatVec( + } + + if (pred) { +- #pragma unroll +- for (int i = 16; i > 0; i /= 2) +- res += __shfl_down_sync(0xffffffff, res, i); ++#pragma unroll ++ for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); + if (threadIdx.x % 32 == 0) + reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); + } + } + +- + __global__ void Code1x16Dequant( +- const int4* __restrict__ A, +- int4* __restrict__ C, +- const int4* __restrict__ codebook, +- int prob_m, +- int prob_k, +- const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m. +- const int codebook_stride // as int4 ++ const int4* __restrict__ A, int4* __restrict__ C, ++ const int4* __restrict__ codebook, int prob_m, int prob_k, ++ const int4 codebook_a_sizes, // cumulative sizes of A spanning each ++ // codebook, at most 3 long, sums to m. ++ const int codebook_stride // as int4 + ) { + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; + +- if (pred) +- { +- // advance to the correct codebook, this easy because we only multiply one column of the codebook. ++ if (pred) { ++ // advance to the correct codebook, this easy because we only multiply one ++ // column of the codebook. 
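// Sketch of how a row finds its codebook partition. accumulate_sizes()
// (defined further down in this file) packs the cumulative partition sizes
// into an int4 and pads the unused entries with an unreachable value; the
// kernels then advance `codebook` until the row index falls inside the
// current partition. Plain-C illustration with hypothetical sizes:
#include <cstdio>

int main() {
  const int partition_sizes[] = {1024, 2048};  // hypothetical output partitions
  const int n = 2;
  int cumulative[4];
  int last = 0;
  for (int i = 0; i < 4; ++i) {
    cumulative[i] = (i < n) ? partition_sizes[i] + last : last * 10;  // pad with unreachable
    if (i < n) last = cumulative[i];
  }
  // cumulative == {1024, 3072, 30720, 30720}

  int row = 1500;        // output row handled by some warp
  int codebook_idx = 0;  // which partition's codebook this row uses
  while (row >= cumulative[codebook_idx]) ++codebook_idx;  // -> 1
  std::printf("row %d uses codebook %d\n", row, codebook_idx);
  return 0;
}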
+ auto codebook_size = &codebook_a_sizes.x; +- while (a_gl_rd >= *codebook_size) +- { +- codebook += codebook_stride; +- ++codebook_size; ++ while (a_gl_rd >= *codebook_size) { ++ codebook += codebook_stride; ++ ++codebook_size; + } + } + +@@ -231,17 +214,15 @@ __global__ void Code1x16Dequant( + while (iters--) { + if (pred && a_gl_rd < a_gl_end) { + const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); +- #pragma unroll ++#pragma unroll + for (int i = 0; i < 8; i++) { + int4 chunk; + auto dec = reinterpret_cast(&chunk); +- // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't +- // actually help us; this brings > 2x speedup. +- asm volatile ( +- "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" +- : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) +- : "l"((void*) &codebook[enc[i]]) +- ); ++ // We bypass the L1 cache to avoid massive amounts of memory streaming ++ // that doesn't actually help us; this brings > 2x speedup. ++ asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" ++ : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) ++ : "l"((void*)&codebook[enc[i]])); + + C[a_gl_rd * 8 + i] = chunk; + } +@@ -250,28 +231,25 @@ __global__ void Code1x16Dequant( + } + } + +- + __global__ void Code2x8Dequant( +- const int4* __restrict__ A, +- int4* __restrict__ C, +- const int4* __restrict__ codebook, +- int prob_m, +- int prob_k, +- const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. +- const int codebook_stride // as int4 ++ const int4* __restrict__ A, int4* __restrict__ C, ++ const int4* __restrict__ codebook, int prob_m, int prob_k, ++ const int4 ++ codebook_a_sizes, // cumulative sizes of A spanning each codebook, at ++ // most 3 long, corresponds to cols. ++ const int codebook_stride // as int4 + ) { + int a_gl_stride = prob_k / 8 / 8; + int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); + bool pred = a_gl_rd < prob_m; + +- if (pred) +- { +- // advance to the correct codebook, this easy because we only multiply one column of the codebook. ++ if (pred) { ++ // advance to the correct codebook, this easy because we only multiply one ++ // column of the codebook. 
+ auto codebook_size = &codebook_a_sizes.x; +- while (a_gl_rd >= *codebook_size) +- { +- codebook += codebook_stride; +- ++codebook_size; ++ while (a_gl_rd >= *codebook_size) { ++ codebook += codebook_stride; ++ ++codebook_size; + } + } + +@@ -290,24 +268,23 @@ __global__ void Code2x8Dequant( + + for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { + int4 dec = codebook[i]; +- #pragma unroll +- for (int j = 0; j < 8; j++) +- sh_code[8 * i + (j + lane) % 8] = dec; ++#pragma unroll ++ for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; + } + __syncthreads(); + +- float res = 0; +- + int iters = (prob_k / 8 - 1) / (8 * 32) + 1; + while (iters--) { + if (pred && a_gl_rd < a_gl_end) { + const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); +- #pragma unroll ++#pragma unroll + for (int i = 0; i < 8; i++) { + int4 chunk; +- half2* a0 = reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); +- half2* a1 = reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); +- #pragma unroll ++ half2* a0 = ++ reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); ++ half2* a1 = ++ reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); ++#pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(&chunk)[j] = __hadd2(a0[j], a1[j]); + C[a_gl_rd * 8 + i] = chunk; +@@ -317,22 +294,15 @@ __global__ void Code2x8Dequant( + } + } + +-inline int ceildiv(int a, int b) { +- return (a + b - 1) / b; +-} ++inline int ceildiv(int a, int b) { return (a + b - 1) / b; } + + const int THREAD_M = 16; + +-void code1x16_matvec_cuda( +- const void* __restrict__ A, +- const void* __restrict__ B, +- void* __restrict__ C, +- const void* __restrict__ codebook, +- int prob_m, +- int prob_k, +- const int4 codebook_a_sizes, +- const int codebook_stride +-) { ++void code1x16_matvec_cuda(const void* __restrict__ A, ++ const void* __restrict__ B, void* __restrict__ C, ++ const void* __restrict__ codebook, int prob_m, ++ int prob_k, const int4 codebook_a_sizes, ++ const int codebook_stride) { + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; +@@ -345,28 +315,16 @@ void code1x16_matvec_cuda( + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); +- Code1x16MatVec<<>>( +- (const int4*) A, +- (const int4*) B, +- (int4*) C, +- (const int4*) codebook, +- prob_m, +- prob_k, +- codebook_a_sizes, +- codebook_stride +- ); ++ Code1x16MatVec<<>>( ++ (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, ++ prob_k, codebook_a_sizes, codebook_stride); + } + +-void code2x8_matvec_cuda( +- const void* __restrict__ A, +- const void* __restrict__ B, +- void* __restrict__ C, +- const void* __restrict__ codebook, +- int prob_m, +- int prob_k, +- const int4 codebook_a_sizes, +- const int codebook_stride +-) { ++void code2x8_matvec_cuda(const void* __restrict__ A, const void* __restrict__ B, ++ void* __restrict__ C, ++ const void* __restrict__ codebook, int prob_m, ++ int prob_k, const int4 codebook_a_sizes, ++ const int codebook_stride) { + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); + int waves = 0; +@@ -379,30 +337,20 @@ void code2x8_matvec_cuda( + int blocks = ceildiv(prob_m, thread_m); + int threads = 32 * thread_m; + int shared = 16 * (2 * 256 * 8 + 32 * 9); +- cudaFuncSetAttribute( +- Code2x8MatVec, cudaFuncAttributeMaxDynamicSharedMemorySize, shared +- ); ++ cudaFuncSetAttribute(Code2x8MatVec, ++ cudaFuncAttributeMaxDynamicSharedMemorySize, shared); + 
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + Code2x8MatVec<<>>( +- (const int4*) A, +- (const int4*) B, +- (int4*) C, +- (const int4*) codebook, +- prob_m, +- prob_k, +- codebook_a_sizes, +- codebook_stride +- ); ++ (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, ++ prob_k, codebook_a_sizes, codebook_stride); + } + + void code1x16_dequant_cuda( +- const void* __restrict__ A, +- void* __restrict__ C, +- const void* __restrict__ codebook, +- int prob_m, +- int prob_k, +- const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. +- const int codebook_stride // as int4. ++ const void* __restrict__ A, void* __restrict__ C, ++ const void* __restrict__ codebook, int prob_m, int prob_k, ++ const int4 codebook_a_sizes, // cumulative sizes of A spanning each ++ // codebook, at most 3 long. ++ const int codebook_stride // as int4. + ) { + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); +@@ -417,25 +365,21 @@ void code1x16_dequant_cuda( + int threads = 32 * thread_m; + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + Code1x16Dequant<<>>( +- (const int4*) A, +- (int4*) C, +- (const int4*) codebook, +- prob_m, +- prob_k, +- codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long. +- codebook_stride // as int4. ++ (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, ++ codebook_a_sizes, // cumulative sizes of A spanning each codebook, at ++ // most 3 long. ++ codebook_stride // as int4. + ); + } + + // Dequantizes the code and codebook into weights. +-void code2x8_dequant_cuda( +- const void* __restrict__ A, +- void* __restrict__ C, +- const void* __restrict__ codebook, +- int prob_m, +- int prob_k, +- const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols. +- const int codebook_stride // as int4 ++void code2x8_dequant_cuda( ++ const void* __restrict__ A, void* __restrict__ C, ++ const void* __restrict__ codebook, int prob_m, int prob_k, ++ const int4 ++ codebook_a_sizes, // cumulative sizes of A spanning each codebook, at ++ // most 3 long, corresponds to cols. ++ const int codebook_stride // as int4 + ) { + int sms; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); +@@ -451,74 +395,50 @@ void code2x8_dequant_cuda( + int shared = 16 * (2 * 256 * 8 + 32 * 9); + cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); + +- cudaFuncSetAttribute( +- Code2x8Dequant, cudaFuncAttributeMaxDynamicSharedMemorySize, shared +- ); ++ cudaFuncSetAttribute(Code2x8Dequant, ++ cudaFuncAttributeMaxDynamicSharedMemorySize, shared); + Code2x8Dequant<<>>( +- (const int4*) A, +- (int4*) C, +- (const int4*) codebook, +- prob_m, +- prob_k, +- codebook_a_sizes, +- codebook_stride +- ); ++ (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, ++ codebook_a_sizes, codebook_stride); + } + +-int codebook_stride(const torch::Tensor& codebooks) +-{ ++int codebook_stride(const torch::Tensor& codebooks) { + return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); + } + + void code1x16_matvec( +- const torch::Tensor& A, +- const torch::Tensor& B, +- torch::Tensor& C, +- const torch::Tensor& codebook, +- const int4 codebook_a_sizes // cumulative sizes of A spanning each codebook, at most 3 long. 
++ const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C, ++ const torch::Tensor& codebook, ++ const int4 codebook_a_sizes // cumulative sizes of A spanning each ++ // codebook, at most 3 long. + ) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); + int prob_m = C.size(0); + int prob_k = B.size(0); + +- code1x16_matvec_cuda( +- A.data_ptr(), +- B.data_ptr(), +- C.data_ptr(), +- codebook.data_ptr(), +- prob_m, +- prob_k, +- codebook_a_sizes, +- codebook_stride(codebook) +- ); ++ code1x16_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), ++ codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, ++ codebook_stride(codebook)); + } + +-torch::Tensor code1x16_matmat( +- const torch::Tensor& input, +- const torch::Tensor& codes, +- const torch::Tensor& codebooks, +- const torch::Tensor& scales, +- const int4 codebook_a_sizes, +- const std::optional& bias) { ++torch::Tensor code1x16_matmat(const torch::Tensor& input, ++ const torch::Tensor& codes, ++ const torch::Tensor& codebooks, ++ const torch::Tensor& scales, ++ const int4 codebook_a_sizes, ++ const std::optional& bias) { + auto input_sizes = input.sizes(); + auto out_features = codes.size(0) * codebooks.size(2); + auto flat_input = input.reshape({-1, input.size(-1)}); +- auto flat_output = torch::empty({flat_input.size(0), out_features}, +- torch::TensorOptions() +- .dtype(input.dtype()) +- .device(input.device()) +- ); ++ auto flat_output = torch::empty( ++ {flat_input.size(0), out_features}, ++ torch::TensorOptions().dtype(input.dtype()).device(input.device())); + + for (int i = 0; i < flat_input.size(0); ++i) { + auto input_vec = flat_input.index({i}); + auto output_vec = flat_output.index({i}); +- code1x16_matvec( +- codes.squeeze(2), +- input_vec, +- output_vec, +- codebooks, +- codebook_a_sizes +- ); ++ code1x16_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, ++ codebook_a_sizes); + } + flat_output *= scales.flatten().unsqueeze(0); + +@@ -533,55 +453,35 @@ torch::Tensor code1x16_matmat( + return output; + } + +-void code2x8_matvec( +- const torch::Tensor& A, +- const torch::Tensor& B, +- torch::Tensor& C, +- const torch::Tensor& codebook, +- const int4 codebook_a_sizes +-) { ++void code2x8_matvec(const torch::Tensor& A, const torch::Tensor& B, ++ torch::Tensor& C, const torch::Tensor& codebook, ++ const int4 codebook_a_sizes) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); + int prob_m = C.size(0); + int prob_k = B.size(0); +- code2x8_matvec_cuda( +- A.data_ptr(), +- B.data_ptr(), +- C.data_ptr(), +- codebook.data_ptr(), +- prob_m, +- prob_k, +- codebook_a_sizes, +- 2 * codebook_stride(codebook) +- ); ++ code2x8_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), ++ codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, ++ 2 * codebook_stride(codebook)); + } + +-torch::Tensor code2x8_matmat( +- const torch::Tensor& input, +- const torch::Tensor& codes, +- const torch::Tensor& codebooks, +- const torch::Tensor& scales, +- const int4 codebook_a_sizes, +- const std::optional& bias +-) { ++torch::Tensor code2x8_matmat(const torch::Tensor& input, ++ const torch::Tensor& codes, ++ const torch::Tensor& codebooks, ++ const torch::Tensor& scales, ++ const int4 codebook_a_sizes, ++ const std::optional& bias) { + auto input_sizes = input.sizes(); + auto out_features = codes.size(0) * codebooks.size(2); + auto flat_input = input.reshape({-1, input.size(-1)}); +- auto flat_output = torch::empty({flat_input.size(0), out_features}, +- torch::TensorOptions() +- .dtype(input.dtype()) +- 
.device(input.device()) +- ); ++ auto flat_output = torch::empty( ++ {flat_input.size(0), out_features}, ++ torch::TensorOptions().dtype(input.dtype()).device(input.device())); + + for (int i = 0; i < flat_input.size(0); ++i) { + auto input_vec = flat_input.index({i}); + auto output_vec = flat_output.index({i}); +- code2x8_matvec( +- codes.squeeze(2), +- input_vec, +- output_vec, +- codebooks, +- codebook_a_sizes +- ); ++ code2x8_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, ++ codebook_a_sizes); + } + flat_output *= scales.flatten().unsqueeze(0); + if (bias.has_value()) { +@@ -596,66 +496,58 @@ torch::Tensor code2x8_matmat( + } + + // Accumulate the partition sizes. +-int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes) +-{ ++int4 accumulate_sizes(const std::vector& codebook_partition_sizes) { + int4 cumulative_sizes; + auto cumulative_size = &cumulative_sizes.x; +- int i = 0; ++ size_t i = 0; + int last = 0; +- assert(codebook_partition_sizes.size(0) <= 4); +- for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) +- { +- *cumulative_size = codebook_partition_sizes[i].item() + last; ++ assert(codebook_partition_sizes.size() <= 4); ++ for (; i < codebook_partition_sizes.size(); ++i, ++cumulative_size) { ++ *cumulative_size = codebook_partition_sizes[i] + last; + last = *cumulative_size; + } + // fill in the rest with unreachable. +- for (; i < 4; ++i, ++cumulative_size) +- { +- *cumulative_size = last*10; ++ for (; i < 4; ++i, ++cumulative_size) { ++ *cumulative_size = last * 10; + } + return cumulative_sizes; + } + +-} // namespace aqlm +-} // namespace vllm +- ++} // namespace aqlm ++} // namespace vllm + +-torch::Tensor aqlm_gemm( +- const torch::Tensor& input, +- const torch::Tensor& codes, +- const torch::Tensor& codebooks, +- const torch::Tensor& scales, +- const torch::Tensor& codebook_partition_sizes, +- const std::optional& bias +-) +-{ +- int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes); ++torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, ++ const torch::Tensor& codebooks, ++ const torch::Tensor& scales, ++ const std::vector& codebook_partition_sizes, ++ const std::optional& bias) { ++ int4 cumulative_sizes = ++ vllm::aqlm::accumulate_sizes(codebook_partition_sizes); + +- int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); ++ int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); + int const entries = codebooks.size(1); + +- if (nbooks == 1 && entries == (1 << 16)) +- { +- return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); ++ if (nbooks == 1 && entries == (1 << 16)) { ++ return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, ++ cumulative_sizes, bias); + } +- if (nbooks == 2 && entries == (1 << 8)) +- { +- return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, cumulative_sizes, bias); ++ if (nbooks == 2 && entries == (1 << 8)) { ++ return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, ++ cumulative_sizes, bias); + } + +- TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") ++ TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, ++ " entries is not currently supported.") + return {}; + } + + torch::Tensor aqlm_dequant( +- const torch::Tensor& codes, +- const torch::Tensor& codebooks, +- const torch::Tensor& codebook_partition_sizes +-) +-{ +- int4 cumulative_sizes = 
vllm::aqlm::accumulate_sizes(codebook_partition_sizes); +- +- int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0); ++ const torch::Tensor& codes, const torch::Tensor& codebooks, ++ const std::vector& codebook_partition_sizes) { ++ int4 cumulative_sizes = ++ vllm::aqlm::accumulate_sizes(codebook_partition_sizes); ++ ++ int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); + int const entries = codebooks.size(1); + + const at::cuda::OptionalCUDAGuard device_guard(device_of(codes)); +@@ -665,48 +557,41 @@ torch::Tensor aqlm_dequant( + auto in_features = codes.size(1) * 8; + auto out_features = codes.size(0); + +- assert(out_features = codebook_partition_sizes.sum().item()); ++ assert(out_features == std::accumulate(codebook_partition_sizes.begin(), ++ codebook_partition_sizes.end(), 0)); + + auto weights = torch::empty({out_features, in_features}, +- torch::TensorOptions() +- .dtype(codebooks.dtype()) +- .device(codebooks.device()) +- ); ++ torch::TensorOptions() ++ .dtype(codebooks.dtype()) ++ .device(codebooks.device())); + +- if (nbooks == 1 && entries == (1 << 16)) +- { +- vllm::aqlm::code1x16_dequant_cuda( +- codes.data_ptr(), +- weights.data_ptr(), +- codebooks.data_ptr(), +- out_features, +- in_features, +- cumulative_sizes, +- vllm::aqlm::codebook_stride(codebooks)); +- +- // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation.) +- // weights *= scales.index({"...", 0, 0}); +- +- return weights; ++ if (nbooks == 1 && entries == (1 << 16)) { ++ vllm::aqlm::code1x16_dequant_cuda(codes.data_ptr(), weights.data_ptr(), ++ codebooks.data_ptr(), out_features, ++ in_features, cumulative_sizes, ++ vllm::aqlm::codebook_stride(codebooks)); ++ ++ // if you wanted to flip to scaling the weights, (though it's 30%-ish slower ++ // and not consistent with gemv implementation.) 
weights *= ++ // scales.index({"...", 0, 0}); ++ ++ return weights; + } + +- if (nbooks == 2 && entries == (1 << 8)) +- { +- vllm::aqlm::code2x8_dequant_cuda( +- codes.data_ptr(), +- weights.data_ptr(), +- codebooks.data_ptr(), +- out_features, +- in_features, +- cumulative_sizes, +- vllm::aqlm::codebook_stride(codebooks)); +- +- // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation) +- // weights *= scales.index({"...", 0, 0}); +- +- return weights; ++ if (nbooks == 2 && entries == (1 << 8)) { ++ vllm::aqlm::code2x8_dequant_cuda(codes.data_ptr(), weights.data_ptr(), ++ codebooks.data_ptr(), out_features, ++ in_features, cumulative_sizes, ++ vllm::aqlm::codebook_stride(codebooks)); ++ ++ // if you wanted to flip to scaling the weights, (though it's 30%-ish slower ++ // and not consistent with gemv implementation) weights *= ++ // scales.index({"...", 0, 0}); ++ ++ return weights; + } + +- TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.") ++ TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, ++ " entries is not currently supported.") + return {}; + } +diff --git a/csrc/quantization/awq/dequantize.cuh b/csrc/quantization/awq/dequantize.cuh +index d1d926d..5fa4b5f 100644 +--- a/csrc/quantization/awq/dequantize.cuh ++++ b/csrc/quantization/awq/dequantize.cuh +@@ -1,11 +1,11 @@ + /* + Adapted from https://github.com/mit-han-lab/llm-awq +-Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h ++Modified from NVIDIA FasterTransformer: ++https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h + @article{lin2023awq, +- title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, +- author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, +- journal={arXiv}, +- year={2023} ++ title={AWQ: Activation-aware Weight Quantization for LLM Compression and ++Acceleration}, author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, ++Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023} + } + */ + +@@ -14,74 +14,89 @@ Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransfor + namespace vllm { + namespace awq { + +-__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) +-{ ++__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 + assert(false); + #else +- uint4 result; ++ uint4 result; + +- uint32_t* h = reinterpret_cast(&result); +- uint32_t const i4s = reinterpret_cast(source); ++ uint32_t* h = reinterpret_cast(&result); ++ uint32_t const i4s = reinterpret_cast(source); + +- // First, we extract the i4s and construct an intermediate fp16 number. +- static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; +- static constexpr uint32_t BOTTOM_MASK = 0x000f000f; +- static constexpr uint32_t TOP_MASK = 0x00f000f0; +- static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; ++ // First, we extract the i4s and construct an intermediate fp16 number. 
++ static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; ++ static constexpr uint32_t BOTTOM_MASK = 0x000f000f; ++ static constexpr uint32_t TOP_MASK = 0x00f000f0; ++ static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; + +- // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing +- // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. +- // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and +- // elt_67 to fp16 without having to shift them to the bottom bits before hand. ++ // Note that the entire sequence only requires 1 shift instruction. This is ++ // thanks to the register packing format and the fact that we force our ++ // integers to be unsigned, and account for this in the fp16 subtractions. In ++ // addition, I exploit the fact that sub and fma have the same throughput in ++ // order to convert elt_23 and elt_67 to fp16 without having to shift them to ++ // the bottom bits before hand. + +- // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue +- // immediately before required. +- const uint32_t top_i4s = i4s >> 8; +- // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 +- asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +- : "=r"(h[0]) +- : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); +- // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 +- asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +- : "=r"(h[1]) +- : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); +- // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 +- asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +- : "=r"(h[2]) +- : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); +- // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 +- asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" +- : "=r"(h[3]) +- : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); ++ // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW ++ // dependency if we issue immediately before required. ++ const uint32_t top_i4s = i4s >> 8; ++ // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 ++ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" ++ : "=r"(h[0]) ++ : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), ++ "n"(immLut)); ++ // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 ++ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" ++ : "=r"(h[1]) ++ : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), ++ "n"(immLut)); ++ // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 ++ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" ++ : "=r"(h[2]) ++ : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), ++ "n"(immLut)); ++ // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 ++ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" ++ : "=r"(h[3]) ++ : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), ++ "n"(immLut)); + +- // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the +- // half2 ctor. In this case, I chose performance reliability over code readability. ++ // I use inline PTX below because I am not sure if the compiler will emit ++ // float2half instructions if I use the half2 ctor. In this case, I chose ++ // performance reliability over code readability. + +- // This is the half2 {1032, 1032} represented as an integer. 
+- // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; +- // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] +- static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; +- // This is the half2 {1 / 16, 1 / 16} represented as an integer. +- static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; +- // This is the half2 {-72, -72} represented as an integer. +- // static constexpr uint32_t NEG_72 = 0xd480d480; +- // Haotian: Let's use {-64, -64}. +- static constexpr uint32_t NEG_64 = 0xd400d400; ++ // This is the half2 {1032, 1032} represented as an integer. ++ // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; ++ // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] ++ static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; ++ // This is the half2 {1 / 16, 1 / 16} represented as an integer. ++ static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; ++ // This is the half2 {-72, -72} represented as an integer. ++ // static constexpr uint32_t NEG_72 = 0xd480d480; ++ // Haotian: Let's use {-64, -64}. ++ static constexpr uint32_t NEG_64 = 0xd400d400; + +- // Finally, we construct the output numbers. +- // Convert elt_01 +- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); +- // Convert elt_23 +- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); +- // Convert elt_45 +- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); +- // Convert elt_67 +- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); ++ // Finally, we construct the output numbers. ++ // Convert elt_01 ++ asm volatile("sub.f16x2 %0, %1, %2;\n" ++ : "=r"(h[0]) ++ : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); ++ // Convert elt_23 ++ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" ++ : "=r"(h[1]) ++ : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); ++ // Convert elt_45 ++ asm volatile("sub.f16x2 %0, %1, %2;\n" ++ : "=r"(h[2]) ++ : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); ++ // Convert elt_67 ++ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" ++ : "=r"(h[3]) ++ : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); + +- return result; ++ return result; + #endif ++ __builtin_unreachable(); // Suppress missing return statement warning + } + +-} // namespace awq +-} // namespace vllm ++} // namespace awq ++} // namespace vllm +diff --git a/csrc/quantization/awq/gemm_kernels.cu b/csrc/quantization/awq/gemm_kernels.cu +index 5aefb0b..9da724a 100644 +--- a/csrc/quantization/awq/gemm_kernels.cu ++++ b/csrc/quantization/awq/gemm_kernels.cu +@@ -1,15 +1,13 @@ + /* + Adapted from https://github.com/mit-han-lab/llm-awq + @article{lin2023awq, +- title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, +- author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, +- journal={arXiv}, +- year={2023} ++ title={AWQ: Activation-aware Weight Quantization for LLM Compression and ++Acceleration}, author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, ++Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023} + } + */ + +- +-#include ++#include + #include + + #include "dequantize.cuh" +@@ -19,27 +17,13 @@ Adapted from https://github.com/mit-han-lab/llm-awq + namespace vllm { + namespace awq { + +-// Pack two half values. 
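// A minimal host-side sketch of the arithmetic behind the magic numbers in
// dequantize_s4_to_fp16x2 above, assuming unsigned 4-bit inputs in [0, 15].
// Plain float math stands in for the packed fp16 lanes, and the interleaved
// lane ordering is ignored; this is an illustration of the two identities,
// not a replacement for the fused lop3/sub/fma path.
#include <cassert>

inline void check_s4_magic_identities() {
  for (int x = 0; x < 16; ++x) {
    // Bottom nibble: OR-ing x into 0x6400 encodes 1024 + x in fp16, so
    // subtracting FP16_TOP_MAGIC_NUM (1024) recovers x.
    float bottom = (1024.0f + x) - 1024.0f;
    // Top nibble: the masked value encodes 1024 + 16 * x, so the fma with
    // ONE_SIXTEENTH (1/16) and NEG_64 (-64) also recovers x.
    float top = (1024.0f + 16.0f * x) * (1.0f / 16.0f) - 64.0f;
    assert(bottom == static_cast<float>(x));
    assert(top == static_cast<float>(x));
  }
}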
+-static inline __device__ __host__ unsigned +-__pack_half2(const half x, const half y) { +- unsigned v0 = *((unsigned short *)&x); +- unsigned v1 = *((unsigned short *)&y); +- return (v1 << 16) | v0; +-} +- +-template +-__global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16nXk32( +- int G, +- int split_k_iters, +- half* __restrict__ A, +- int* __restrict__ B, +- half* __restrict__ scaling_factors, +- int* __restrict__ zeros, +- int M, +- int IC, +- int OC, +- half* __restrict__ C) +-{ ++template ++__global__ void __launch_bounds__(64) ++ gemm_forward_4bit_cuda_m16nXk32(int G, int split_k_iters, ++ half* __restrict__ A, int* __restrict__ B, ++ half* __restrict__ scaling_factors, ++ int* __restrict__ zeros, int M, int IC, ++ int OC, half* __restrict__ C) { + // Only support matrix n = 64 or 128 + assert(N == 64 || N == 128); + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 +@@ -50,11 +34,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16nXk32( + __shared__ half A_shared[16 * (32 + 8)]; + __shared__ half B_shared[32 * (N + 8)]; + +- __shared__ half scaling_factors_shared[N]; +- __shared__ half zeros_shared[N]; +- + int j_factors1 = ((OC + N - 1) / N); +- int blockIdx_x = 0; + int blockIdx_y = blockIdx.x % ((M + 16 - 1) / 16 * j_factors1); + int blockIdx_z = blockIdx.x / ((M + 16 - 1) / 16 * j_factors1); + +@@ -68,45 +48,47 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16nXk32( + + static constexpr int row_stride_warp = 32 * 8 / 32; + static constexpr int row_stride = 2 * 32 * 8 / N; +- bool ld_zero_flag = (threadIdx.y * 32 + threadIdx.x) * 8 < N; + // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16 +- bool ld_A_flag = (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp + threadIdx.x * 8 / 32) < M; // threadIdx.y is warp_id ++ bool ld_A_flag = ++ (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp + ++ threadIdx.x * 8 / 32) < M; // threadIdx.y is warp_id + // bool wb_C_flag = (threadIdx.x / 4) < M; + +- half* A_ptr = A +- + (((int)blockIdx_y) / j_factors1 * 16 + (((int)threadIdx.y) * row_stride_warp) + ((int)threadIdx.x) / (32 / 8)) * IC +- + (((int)threadIdx.x) % (32 / 8)) * 8; +- +- int* B_ptr = B +- + ((int)threadIdx.y) * (OC / 8) * (256 / N) +- + (((int)threadIdx.x) / (N / 8)) * (OC / 8) +- + (((int)blockIdx_y) % j_factors1) * (N / 8) +- + (((int)threadIdx.x) % (N / 8)) * 1; +-// Why * 1 in the above line? 
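// Likely answer to the question above: B is an array of int32 words, each
// packing 8 contiguous 4-bit weights along the output-channel dimension
// (the packed matrix has OC / 8 columns), so (threadIdx.x % (N / 8)) already
// indexes packed words whose column stride is 1; the "* 1" merely spells
// that stride out.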
+- +- half* A_shared_ptr = A_shared +- + ((int)threadIdx.y) * row_stride_warp * (32 + 8) +- + (((int)threadIdx.x) / (32 / 8)) * (32 + 8) +- + (((int)threadIdx.x) % (32 / 8) ) * 8; +- +- half* B_shared_ptr = B_shared +- + ((int)threadIdx.y) * (row_stride / 2) * (N + 8) +- + (((int)threadIdx.x) / (N / 8)) * (N + 8) +- + (((int)threadIdx.x) % (N / 8)) * 8; +- +- int* zeros_ptr = zeros +- + (((int)blockIdx_y) % j_factors1) * (N / 8) +- + ((int)threadIdx.x) % (N / 8); +- +- half* scaling_factors_ptr = scaling_factors +- + (((int)blockIdx_y) % j_factors1) * N +- + (((int)threadIdx.x) % (N / 8)) * 8; +- +- half* C_ptr = C +- + static_cast(blockIdx_z) * M * OC // blockIdz.x -> split_k dim +- + (((int)blockIdx_y) % j_factors1) * N +- + ((int)threadIdx.y) * (N / 2) +- + (((int)threadIdx.x) % 4) * 2; ++ half* A_ptr = ++ A + ++ (((int)blockIdx_y) / j_factors1 * 16 + ++ (((int)threadIdx.y) * row_stride_warp) + ((int)threadIdx.x) / (32 / 8)) * ++ IC + ++ (((int)threadIdx.x) % (32 / 8)) * 8; ++ ++ int* B_ptr = B + ((int)threadIdx.y) * (OC / 8) * (256 / N) + ++ (((int)threadIdx.x) / (N / 8)) * (OC / 8) + ++ (((int)blockIdx_y) % j_factors1) * (N / 8) + ++ (((int)threadIdx.x) % (N / 8)) * 1; ++ // Why * 1 in the above line? ++ ++ half* A_shared_ptr = A_shared + ++ ((int)threadIdx.y) * row_stride_warp * (32 + 8) + ++ (((int)threadIdx.x) / (32 / 8)) * (32 + 8) + ++ (((int)threadIdx.x) % (32 / 8)) * 8; ++ ++ half* B_shared_ptr = B_shared + ++ ((int)threadIdx.y) * (row_stride / 2) * (N + 8) + ++ (((int)threadIdx.x) / (N / 8)) * (N + 8) + ++ (((int)threadIdx.x) % (N / 8)) * 8; ++ ++ int* zeros_ptr = zeros + (((int)blockIdx_y) % j_factors1) * (N / 8) + ++ ((int)threadIdx.x) % (N / 8); ++ ++ half* scaling_factors_ptr = scaling_factors + ++ (((int)blockIdx_y) % j_factors1) * N + ++ (((int)threadIdx.x) % (N / 8)) * 8; ++ ++ half* C_ptr = ++ C + ++ static_cast(blockIdx_z) * M * OC // blockIdz.x -> split_k dim ++ + (((int)blockIdx_y) % j_factors1) * N + ((int)threadIdx.y) * (N / 2) + ++ (((int)threadIdx.x) % 4) * 2; + + // preload s.f. 
and zeros + int k_bound = (IC / 32 + split_k_iters - 1) / split_k_iters; +@@ -115,57 +97,79 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16nXk32( + int k_0_0 = _k_0_0 * split_k_iters + blockIdx_z; + __syncthreads(); + // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16 +- if (ld_A_flag) +- { ++ if (ld_A_flag) { + *(uint4*)(A_shared_ptr) = *(uint4*)(A_ptr + (k_0_0 * 32)); +- } +- else +- { ++ } else { + *(uint4*)(A_shared_ptr) = make_uint4(0, 0, 0, 0); + } + + // for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 2; ++ax0_ax1_fused_0) { + uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr + k_0_0 * 32 / G * (OC / 8)); + uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded); +- uint4 B_loaded_scale = *(uint4*)(scaling_factors_ptr + k_0_0 * 32 / G * (OC)); ++ uint4 B_loaded_scale = ++ *(uint4*)(scaling_factors_ptr + k_0_0 * 32 / G * (OC)); + /* +- if (blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 0 && threadIdx.y == 0){ +- printf("%x %x %x %x %x %x %x %x\n", B_loaded_scale.x, B_loaded_scale.y, B_loaded_scale.z, B_loaded_scale.w, B_loaded_zero.x, B_loaded_zero.y, B_loaded_zero.z, B_loaded_zero.w); ++ if (blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 0 && ++ threadIdx.y == 0){ printf("%x %x %x %x %x %x %x %x\n", B_loaded_scale.x, ++ B_loaded_scale.y, B_loaded_scale.z, B_loaded_scale.w, B_loaded_zero.x, ++ B_loaded_zero.y, B_loaded_zero.z, B_loaded_zero.w); + } + */ + // uint4 B_loaded_scale = make_uint4(0, 0, 0, 0); + int* B_ptr_local = B_ptr + k_0_0 * 32 * (OC / 8); + + for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < N / 16; ++ax0_ax1_fused_0) { +- + // B: 32 x 136 (128+8) float16 + // each warp: 32 x 4 +- // each thr: read 32 bit -> convert to 8xFP16 (a UINT4) -> scale and minus zero -> WB UINT4 +- // *(uint4*)(B_shared + ((((ax0_ax1_fused_0 * 544) + (((int)threadIdx.y) * 272)) + ((((int)threadIdx.x) >> 4) * 136)) + ((((int)threadIdx.x) & 15) * 8))) = *(uint4*)(B + ((((((k_0_0 * 163840) + (ax0_ax1_fused_0 * 20480)) + (((int)threadIdx.y) * 10240)) + ((((int)threadIdx.x) >> 4) * 5120)) + (((int)blockIdx_y) * 128)) + ((((int)threadIdx.x) & 15) * 8))); +- // row stride in shared memory: (NWARPS * 32 * 8 / cta_N) +- uint32_t B_loaded = *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8)); ++ // each thr: read 32 bit -> convert to 8xFP16 (a UINT4) -> scale and minus ++ // zero -> WB UINT4 ++ // *(uint4*)(B_shared + ((((ax0_ax1_fused_0 * 544) + (((int)threadIdx.y) * ++ // 272)) + ((((int)threadIdx.x) >> 4) * 136)) + ((((int)threadIdx.x) & 15) ++ // * 8))) = *(uint4*)(B + ((((((k_0_0 * 163840) + (ax0_ax1_fused_0 * ++ // 20480)) + (((int)threadIdx.y) * 10240)) + ((((int)threadIdx.x) >> 4) * ++ // 5120)) + (((int)blockIdx_y) * 128)) + ((((int)threadIdx.x) & 15) * ++ // 8))); row stride in shared memory: (NWARPS * 32 * 8 / cta_N) ++ uint32_t B_loaded = ++ *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8)); + uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); +- //uint4 B_loaded_zero = *(uint4*)(zeros_shared + (threadIdx.x % (cta_N / 8)) * 8); + +- // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8); + // - zero and * scale +- // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = q * scale - zero * scale. 
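// A scalar sketch of what each sub/fma pair below computes per packed half
// lane, and of the refactoring suggested in the TODO above; names are
// illustrative and the fp16 packing is ignored.
inline float awq_dequant_one(float q, float zero, float scale) {
  // current form: one sub plus one fma (with addend ZERO) per lane
  return (q - zero) * scale;
}

inline float awq_dequant_one_refactored(float q, float scale,
                                        float zero_times_scale) {
  // equivalent form: a single fma per lane once zero * scale is precomputed
  return q * scale - zero_times_scale;
}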
+- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); +- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); +- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); +- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); +- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); +- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); +- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); +- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); ++ // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = ++ // q * scale - zero * scale. ++ asm volatile("sub.f16x2 %0, %1, %2;\n" ++ : "=r"(B_loaded_fp16.x) ++ : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); ++ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" ++ : "=r"(B_loaded_fp16.x) ++ : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); ++ asm volatile("sub.f16x2 %0, %1, %2;\n" ++ : "=r"(B_loaded_fp16.y) ++ : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); ++ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" ++ : "=r"(B_loaded_fp16.y) ++ : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); ++ asm volatile("sub.f16x2 %0, %1, %2;\n" ++ : "=r"(B_loaded_fp16.z) ++ : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); ++ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" ++ : "=r"(B_loaded_fp16.z) ++ : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); ++ asm volatile("sub.f16x2 %0, %1, %2;\n" ++ : "=r"(B_loaded_fp16.w) ++ : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); ++ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" ++ : "=r"(B_loaded_fp16.w) ++ : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + /* +- if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){ +- printf("[x] %X %X %X %X\n", B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); ++ if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == ++ 0 && threadIdx.x == 17 && threadIdx.y == 0){ printf("[x] %X %X %X %X\n", ++ B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); + } + */ + + // write back +- *(uint4*)(B_shared_ptr + ax0_ax1_fused_0 * row_stride * (N + 8)) = B_loaded_fp16; ++ *(uint4*)(B_shared_ptr + ax0_ax1_fused_0 * row_stride * (N + 8)) = ++ B_loaded_fp16; + } + __syncthreads(); + +@@ -173,123 +177,184 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16nXk32( + { + unsigned int addr; + __asm__ __volatile__( +- "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" +- : "=r"(addr) +- : "l"((void *)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8)))) +- ); +- ++ "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, " ++ "addr; }\n" ++ : "=r"(addr) ++ : "l"((void*)((&(A_shared[(k_0_1 * 16)])) + ++ (((((int)threadIdx.x) & 15) * 40) + ++ ((((int)threadIdx.x) >> 4) * 8))))); + + __asm__ __volatile__( +- "ldmatrix.sync.aligned.m8n8.x4.shared.b16" +- "{%0, %1, %2, %3}, [%4];\n" +- : "=r"(((unsigned *)(A_shared_warp + 0))[0]), 
"=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3]) +- : "r"(addr) +- ); ++ "ldmatrix.sync.aligned.m8n8.x4.shared.b16" ++ "{%0, %1, %2, %3}, [%4];\n" ++ : "=r"(((unsigned*)(A_shared_warp + 0))[0]), ++ "=r"(((unsigned*)(A_shared_warp + 0))[1]), ++ "=r"(((unsigned*)(A_shared_warp + 0))[2]), ++ "=r"(((unsigned*)(A_shared_warp + 0))[3]) ++ : "r"(addr)); + } + + for (int ax1_0 = 0; ax1_0 < N / 32; ++ax1_0) { + { + unsigned int addr; + __asm__ __volatile__( +- "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" +- : "=r"(addr) +- : "l"((void *)((&(B_shared[(((k_0_1 * (N * 16 + 128)) + (((int)threadIdx.y) * (N / 2))) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * (N + 8)) + ((((int)threadIdx.x) >> 4) * 8)))) +- ); ++ "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, " ++ "addr; }\n" ++ : "=r"(addr) ++ : "l"((void*)((&(B_shared[(((k_0_1 * (N * 16 + 128)) + ++ (((int)threadIdx.y) * (N / 2))) + ++ (ax1_0 * 16))])) + ++ (((((int)threadIdx.x) & 15) * (N + 8)) + ++ ((((int)threadIdx.x) >> 4) * 8))))); + __asm__ __volatile__( +- "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" +- "{%0, %1, %2, %3}, [%4];\n" +- : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3]) +- : "r"(addr) +- ); ++ "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" ++ "{%0, %1, %2, %3}, [%4];\n" ++ : "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[0]), ++ "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[1]), ++ "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[2]), ++ "=r"(((unsigned*)(B_shared_warp + (ax1_0 * 8)))[3]) ++ : "r"(addr)); + } + } + for (int j_0_4 = 0; j_0_4 < N / 32; ++j_0_4) { +-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 ++ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + { + __asm__ __volatile__( +- "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" +- "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" +- : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) +- : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); ++ "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" ++ "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" ++ : "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]), ++ "=f"(((float*)(C_warp + (j_0_4 * 8)))[1]), ++ "=f"(((float*)(C_warp + (j_0_4 * 8)))[2]), ++ "=f"(((float*)(C_warp + (j_0_4 * 8)))[3]) ++ : "r"(((unsigned*)(A_shared_warp + 0))[0]), ++ "r"(((unsigned*)(A_shared_warp + 0))[1]), ++ "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[0]), ++ "f"(((float*)(C_warp + (j_0_4 * 8)))[0]), ++ "f"(((float*)(C_warp + (j_0_4 * 8)))[1]), ++ "f"(((float*)(C_warp + (j_0_4 * 8)))[2]), ++ "f"(((float*)(C_warp + (j_0_4 * 8)))[3])); + } + + { + __asm__ __volatile__( +- "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" +- "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" +- : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + 
((j_0_4 * 8) + 4)))[3]) +- : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); ++ "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" ++ "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" ++ : "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), ++ "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), ++ "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), ++ "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3]) ++ : "r"(((unsigned*)(A_shared_warp + 0))[0]), ++ "r"(((unsigned*)(A_shared_warp + 0))[1]), ++ "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), ++ "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), ++ "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), ++ "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), ++ "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3])); + } + + { + __asm__ __volatile__( +- "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" +- "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" +- : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) +- : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); ++ "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" ++ "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" ++ : "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]), ++ "=f"(((float*)(C_warp + (j_0_4 * 8)))[1]), ++ "=f"(((float*)(C_warp + (j_0_4 * 8)))[2]), ++ "=f"(((float*)(C_warp + (j_0_4 * 8)))[3]) ++ : "r"(((unsigned*)(A_shared_warp + 0))[2]), ++ "r"(((unsigned*)(A_shared_warp + 0))[3]), ++ "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[1]), ++ "f"(((float*)(C_warp + (j_0_4 * 8)))[0]), ++ "f"(((float*)(C_warp + (j_0_4 * 8)))[1]), ++ "f"(((float*)(C_warp + (j_0_4 * 8)))[2]), ++ "f"(((float*)(C_warp + (j_0_4 * 8)))[3])); + } + + { + __asm__ __volatile__( +- "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" +- "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" +- : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) +- : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); ++ "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" ++ "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" ++ : "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), ++ "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), ++ "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), ++ "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3]) ++ : "r"(((unsigned*)(A_shared_warp + 0))[2]), ++ "r"(((unsigned*)(A_shared_warp + 0))[3]), ++ "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), ++ "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), ++ 
"f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), ++ "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), ++ "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3])); + } +-#else ++ #else + { + __asm__ __volatile__( +- "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" +- "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" +- : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) +- : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); ++ "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" ++ "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, " ++ "%13};\n" ++ : "=f"(((float*)(C_warp + (j_0_4 * 8)))[0]), ++ "=f"(((float*)(C_warp + (j_0_4 * 8)))[1]), ++ "=f"(((float*)(C_warp + (j_0_4 * 8)))[2]), ++ "=f"(((float*)(C_warp + (j_0_4 * 8)))[3]) ++ : "r"(((unsigned*)(A_shared_warp + 0))[0]), ++ "r"(((unsigned*)(A_shared_warp + 0))[1]), ++ "r"(((unsigned*)(A_shared_warp + 0))[2]), ++ "r"(((unsigned*)(A_shared_warp + 0))[3]), ++ "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[0]), ++ "r"(((unsigned*)(B_shared_warp + (j_0_4 * 8)))[1]), ++ "f"(((float*)(C_warp + (j_0_4 * 8)))[0]), ++ "f"(((float*)(C_warp + (j_0_4 * 8)))[1]), ++ "f"(((float*)(C_warp + (j_0_4 * 8)))[2]), ++ "f"(((float*)(C_warp + (j_0_4 * 8)))[3])); + } + + { + __asm__ __volatile__( +- "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" +- "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" +- : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) +- : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); ++ "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" ++ "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, " ++ "%13};\n" ++ : "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), ++ "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), ++ "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), ++ "=f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3]) ++ : "r"(((unsigned*)(A_shared_warp + 0))[0]), ++ "r"(((unsigned*)(A_shared_warp + 0))[1]), ++ "r"(((unsigned*)(A_shared_warp + 0))[2]), ++ "r"(((unsigned*)(A_shared_warp + 0))[3]), ++ "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), ++ "r"(((unsigned*)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), ++ "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[0]), ++ "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[1]), ++ "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[2]), ++ "f"(((float*)(C_warp + ((j_0_4 * 8) + 4)))[3])); + } + +-#endif ++ #endif + } + } + } + +-// TODO: Shang: Hoist loop invariance. 
++ // TODO: Shang: Hoist loop invariance. + for (int ax1_0_1 = 0; ax1_0_1 < 4; ++ax1_0_1) { + for (int local_id = 0; local_id < 8; ++local_id) { +- int row_offset = (((int)blockIdx_y) / j_factors1) * 16 + ((int)threadIdx.x) / 4 + (local_id % 4) / 2 * 8; +- if (row_offset < M) +- { +- *(C_ptr + ax1_0_1 * 16 + row_offset * OC + (local_id / 4) * 8 + local_id % 2) = __float2half(C_warp[(ax1_0_1 * 8) + local_id]); ++ int row_offset = (((int)blockIdx_y) / j_factors1) * 16 + ++ ((int)threadIdx.x) / 4 + (local_id % 4) / 2 * 8; ++ if (row_offset < M) { ++ *(C_ptr + ax1_0_1 * 16 + row_offset * OC + (local_id / 4) * 8 + ++ local_id % 2) = __float2half(C_warp[(ax1_0_1 * 8) + local_id]); + } + } + } + #endif + } + +-__global__ void __launch_bounds__(64) dequantize_weights( +- int* __restrict__ B, +- half* __restrict__ scaling_factors, +- int* __restrict__ zeros, +- half* __restrict__ C, +- int G +-) +-{ +- int j_factors1 = 4; +- int row_stride2 = 4; +- int split_k_iters = 1; ++__global__ void __launch_bounds__(64) ++ dequantize_weights(int* __restrict__ B, half* __restrict__ scaling_factors, ++ int* __restrict__ zeros, half* __restrict__ C, int G) { + static constexpr uint32_t ZERO = 0x0; + half B_shared[32 * (128 + 8)]; + + half* B_shared_ptr2 = B_shared; + +- half B_shared_warp[32]; +- int OC = 512; +- + int N = blockDim.x * gridDim.x; // 2 + int col = (blockIdx.x * blockDim.x + threadIdx.x); + int row = blockIdx.y * blockDim.y + threadIdx.y; +@@ -310,14 +375,30 @@ __global__ void __launch_bounds__(64) dequantize_weights( + + uint32_t B_loaded = *(uint32_t*)B_ptr2; + uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); +- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); +- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); +- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); +- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); +- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); +- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); +- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); +- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); ++ asm volatile("sub.f16x2 %0, %1, %2;\n" ++ : "=r"(B_loaded_fp16.x) ++ : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); ++ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" ++ : "=r"(B_loaded_fp16.x) ++ : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); ++ asm volatile("sub.f16x2 %0, %1, %2;\n" ++ : "=r"(B_loaded_fp16.y) ++ : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); ++ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" ++ : "=r"(B_loaded_fp16.y) ++ : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); ++ asm volatile("sub.f16x2 %0, %1, %2;\n" ++ : "=r"(B_loaded_fp16.z) ++ : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); ++ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" ++ : "=r"(B_loaded_fp16.z) ++ : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); ++ asm volatile("sub.f16x2 %0, %1, %2;\n" ++ : "=r"(B_loaded_fp16.w) ++ : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); ++ asm volatile("fma.rn.f16x2 %0, %1, %2, 
%3;\n" ++ : "=r"(B_loaded_fp16.w) ++ : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + + *(uint4*)B_shared_ptr2 = B_loaded_fp16; + +@@ -326,58 +407,57 @@ __global__ void __launch_bounds__(64) dequantize_weights( + } + } + +-} // namespace awq +-} // namespace vllm +- +-torch::Tensor awq_dequantize( +- torch::Tensor _kernel, +- torch::Tensor _scaling_factors, +- torch::Tensor _zeros, +- int split_k_iters, +- int thx, +- int thy) +-{ +- int in_c = _kernel.size(0); +- int qout_c = _kernel.size(1); +- int out_c = qout_c * 8; +- int G = in_c / _scaling_factors.size(0); +- +- int x_thread = thx; +- int y_thread = thy; +- +- int x_blocks = 1; +- int y_blocks = 1; +- if (thx==0) { +- x_thread = qout_c; +- } +- if (thy==0) { +- y_thread = in_c; +- } +- if (thx==0 && thy==0) { +- x_thread = 8; +- y_thread = 8; +- x_blocks = (int)(qout_c / 8); +- y_blocks = (int)(in_c / 8); +- } ++} // namespace awq ++} // namespace vllm ++ ++torch::Tensor awq_dequantize(torch::Tensor _kernel, ++ torch::Tensor _scaling_factors, ++ torch::Tensor _zeros, int64_t split_k_iters, ++ int64_t thx, int64_t thy) { ++ int in_c = _kernel.size(0); ++ int qout_c = _kernel.size(1); ++ int out_c = qout_c * 8; ++ int G = in_c / _scaling_factors.size(0); ++ ++ int x_thread = thx; ++ int y_thread = thy; ++ ++ int x_blocks = 1; ++ int y_blocks = 1; ++ if (thx == 0) { ++ x_thread = qout_c; ++ } ++ if (thy == 0) { ++ y_thread = in_c; ++ } ++ if (thx == 0 && thy == 0) { ++ x_thread = 8; ++ y_thread = 8; ++ x_blocks = (int)(qout_c / 8); ++ y_blocks = (int)(in_c / 8); ++ } + +- const at::cuda::OptionalCUDAGuard device_guard(device_of(_scaling_factors)); ++ const at::cuda::OptionalCUDAGuard device_guard(device_of(_scaling_factors)); + +- auto options = torch::TensorOptions().dtype(_scaling_factors.dtype()).device(_scaling_factors.device()); +- at::Tensor _de_kernel = torch::empty({in_c, out_c}, options); ++ auto options = torch::TensorOptions() ++ .dtype(_scaling_factors.dtype()) ++ .device(_scaling_factors.device()); ++ at::Tensor _de_kernel = torch::empty({in_c, out_c}, options); + +- auto kernel = reinterpret_cast(_kernel.data_ptr()); +- auto de_kernel = reinterpret_cast(_de_kernel.data_ptr()); +- auto scaling_factors = reinterpret_cast(_scaling_factors.data_ptr()); +- auto zeros = reinterpret_cast(_zeros.data_ptr()); ++ auto kernel = reinterpret_cast(_kernel.data_ptr()); ++ auto de_kernel = reinterpret_cast(_de_kernel.data_ptr()); ++ auto scaling_factors = ++ reinterpret_cast(_scaling_factors.data_ptr()); ++ auto zeros = reinterpret_cast(_zeros.data_ptr()); + +- dim3 num_blocks(x_blocks, y_blocks); +- dim3 threads_per_block(x_thread, y_thread); ++ dim3 num_blocks(x_blocks, y_blocks); ++ dim3 threads_per_block(x_thread, y_thread); + +- const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); +- vllm::awq::dequantize_weights<<>>( +- kernel, scaling_factors, zeros, de_kernel, G); ++ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); ++ vllm::awq::dequantize_weights<<>>( ++ kernel, scaling_factors, zeros, de_kernel, G); + +- return _de_kernel; ++ return _de_kernel; + } + + // in_feats: M, IC [float16] +@@ -386,61 +466,61 @@ torch::Tensor awq_dequantize( + // zeros: IC // G, OC // 8 [int32] -> cast to IC // G, OC [uint4b] + // assume that batch_size < 16 for now + +-torch::Tensor awq_gemm( +- torch::Tensor _in_feats, +- torch::Tensor _kernel, +- torch::Tensor _scaling_factors, +- torch::Tensor _zeros, +- int split_k_iters) +-{ +- int num_in_feats = _in_feats.size(0); +- int num_in_channels = _in_feats.size(1); +- 
const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats)); +- +- auto options = torch::TensorOptions().dtype(_in_feats.dtype()).device(_in_feats.device()); +- at::Tensor _out_feats = torch::empty({split_k_iters, num_in_feats, _kernel.size(1) * 8}, options); +- int num_out_feats = _out_feats.size(-2); +- int num_out_channels = _out_feats.size(-1); +- +- auto in_feats = reinterpret_cast(_in_feats.data_ptr()); +- auto kernel = reinterpret_cast(_kernel.data_ptr()); +- auto out_feats = reinterpret_cast(_out_feats.data_ptr()); +- auto scaling_factors = reinterpret_cast(_scaling_factors.data_ptr()); +- auto zeros = reinterpret_cast(_zeros.data_ptr()); +- int group_size = num_in_channels / _scaling_factors.size(0); +- +- if (num_out_channels % 64 != 0) +- throw std::invalid_argument("OC is not multiple of cta_N = 64"); +- if (num_out_channels % 8 != 0) +- throw std::invalid_argument("OC is not multiple of pack_num = 8"); +- if (group_size % 32 != 0) +- throw std::invalid_argument("Group size should be a multiple of 32"); +- if (num_out_channels % group_size != 0) +- throw std::invalid_argument("OC is not multiple of Group size"); +- +- const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); +- if (num_out_channels % 128 == 0) +- { +- int j_factors1 = num_out_channels / 128 / 1; +- dim3 num_blocks((num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters); +- // threadIdx.x: 32 +- // threadIdx.y: i_factors[2] * j_factors[2] +- dim3 threads_per_block(32, 2); +- vllm::awq::gemm_forward_4bit_cuda_m16nXk32<128><<>>( +- group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, +- num_out_channels, out_feats); +- } +- else if (num_out_channels % 64 == 0) +- { +- int j_factors1 = num_out_channels / 64 / 1; +- dim3 num_blocks(1 * (num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters); +- +- // threadIdx.x: 32 +- // threadIdx.y: i_factors[2] * j_factors[2] +- dim3 threads_per_block(32, 2); +- vllm::awq::gemm_forward_4bit_cuda_m16nXk32<64><<>>( +- group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, +- num_out_channels, out_feats); +- } +- return _out_feats.sum(0); ++torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, ++ torch::Tensor _scaling_factors, torch::Tensor _zeros, ++ int64_t split_k_iters) { ++ int num_in_feats = _in_feats.size(0); ++ int num_in_channels = _in_feats.size(1); ++ const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats)); ++ ++ auto options = torch::TensorOptions() ++ .dtype(_in_feats.dtype()) ++ .device(_in_feats.device()); ++ at::Tensor _out_feats = ++ torch::empty({split_k_iters, num_in_feats, _kernel.size(1) * 8}, options); ++ int num_out_feats = _out_feats.size(-2); ++ int num_out_channels = _out_feats.size(-1); ++ ++ auto in_feats = reinterpret_cast(_in_feats.data_ptr()); ++ auto kernel = reinterpret_cast(_kernel.data_ptr()); ++ auto out_feats = reinterpret_cast(_out_feats.data_ptr()); ++ auto scaling_factors = ++ reinterpret_cast(_scaling_factors.data_ptr()); ++ auto zeros = reinterpret_cast(_zeros.data_ptr()); ++ int group_size = num_in_channels / _scaling_factors.size(0); ++ ++ if (num_out_channels % 64 != 0) ++ throw std::invalid_argument("OC is not multiple of cta_N = 64"); ++ if (num_out_channels % 8 != 0) ++ throw std::invalid_argument("OC is not multiple of pack_num = 8"); ++ if (group_size % 32 != 0) ++ throw std::invalid_argument("Group size should be a multiple of 32"); ++ if (num_out_channels % group_size != 0) ++ throw 
std::invalid_argument("OC is not multiple of Group size"); ++ ++ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); ++ if (num_out_channels % 128 == 0) { ++ int j_factors1 = num_out_channels / 128 / 1; ++ dim3 num_blocks((num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters); ++ // threadIdx.x: 32 ++ // threadIdx.y: i_factors[2] * j_factors[2] ++ dim3 threads_per_block(32, 2); ++ vllm::awq::gemm_forward_4bit_cuda_m16nXk32<128> ++ <<>>( ++ group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, ++ num_in_feats, num_in_channels, num_out_channels, out_feats); ++ } else if (num_out_channels % 64 == 0) { ++ int j_factors1 = num_out_channels / 64 / 1; ++ dim3 num_blocks(1 * (num_out_feats + 16 - 1) / 16 * j_factors1 * ++ split_k_iters); ++ ++ // threadIdx.x: 32 ++ // threadIdx.y: i_factors[2] * j_factors[2] ++ dim3 threads_per_block(32, 2); ++ vllm::awq::gemm_forward_4bit_cuda_m16nXk32<64> ++ <<>>( ++ group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, ++ num_in_feats, num_in_channels, num_out_channels, out_feats); ++ } ++ return _out_feats.sum(0); + } +diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +new file mode 100644 +index 0000000..e797858 +--- /dev/null ++++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +@@ -0,0 +1,286 @@ ++#include ++#include ++#include ++ ++#include "../../dispatch_utils.h" ++ ++#ifndef USE_ROCM ++ #include ++ #include ++#else ++ #include ++ #include ++#endif ++ ++static inline __device__ int8_t float_to_int8_rn(float x) { ++#ifdef USE_ROCM ++ static constexpr auto i8_min = ++ static_cast(std::numeric_limits::min()); ++ static constexpr auto i8_max = ++ static_cast(std::numeric_limits::max()); ++ ++ // To match the rounding mode of CUDA, we use nearbyint. ++ // It uses the current rounding mode, which is always FE_TONEAREST on HIP. ++ // If that changes in the future, we may need to set the rounding mode ++ // explicitly, either at runtime or compile time. ++ float dst = std::nearbyint(x); ++ ++ // saturate ++ dst = std::clamp(dst, i8_min, i8_max); ++ return static_cast(dst); ++#else ++ // CUDA path ++ uint32_t dst; ++ asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x)); ++ return reinterpret_cast(dst); ++#endif ++} ++ ++static inline __device__ int32_t float_to_int32_rn(float x) { ++#ifdef USE_ROCM ++ // int32_max is not exactly representable as float. ++ // Therefore, we need to be careful and manually return int32_max on overflow. ++ // For symmetry, we also do the same for int32_min, even though it is exactly ++ // representable as float and the conversion should be exact. ++ static constexpr auto i32_min = std::numeric_limits::min(); ++ static constexpr auto i32_min_f = static_cast(i32_min); ++ static constexpr auto i32_max = std::numeric_limits::max(); ++ static constexpr auto i32_max_f = static_cast(i32_max); ++ ++ // To match the rounding mode of CUDA, we use nearbyint. ++ // It uses the current rounding mode, which is always FE_TONEAREST on HIP. ++ // If that changes in the future, we may need to set the rounding mode ++ // explicitly, either at runtime or compile time. ++ float dst = std::nearbyint(x); ++ ++ // saturate on the higher end. ++ if (dst >= i32_max_f) { ++ return i32_max; ++ } ++ // saturate on the lower end. 
++ if (dst <= i32_min_f) { ++ return i32_min; ++ } ++ ++ return static_cast(dst); ++#else ++ // CUDA path ++ uint32_t dst; ++ asm volatile("cvt.rni.sat.s32.f32 %0, %1;" : "=r"(dst) : "f"(x)); ++ return reinterpret_cast(dst); ++#endif ++} ++ ++static inline __device__ int8_t int32_to_int8(int32_t x) { ++#ifdef USE_ROCM ++ static constexpr auto i8_min = ++ static_cast(std::numeric_limits::min()); ++ static constexpr auto i8_max = ++ static_cast(std::numeric_limits::max()); ++ ++ // saturate ++ int32_t dst = std::clamp(x, i8_min, i8_max); ++ return static_cast(dst); ++#else ++ // CUDA path ++ uint32_t dst; ++ asm volatile("cvt.sat.s8.s32 %0, %1;" : "=r"(dst) : "r"(x)); ++ return reinterpret_cast(dst); ++#endif ++} ++ ++namespace vllm { ++ ++template ++__global__ void static_scaled_int8_quant_kernel( ++ scalar_t const* __restrict__ input, int8_t* __restrict__ out, ++ scale_type const* scale_ptr, const int hidden_size) { ++ int const tid = threadIdx.x; ++ int64_t const token_idx = blockIdx.x; ++ scale_type const scale = *scale_ptr; ++ ++ // Must be performed using 64-bit math to avoid integer overflow. ++ out += token_idx * hidden_size; ++ input += token_idx * hidden_size; ++ ++ for (int i = tid; i < hidden_size; i += blockDim.x) { ++ out[i] = float_to_int8_rn(static_cast(input[i]) / scale); ++ } ++} ++ ++template ++__global__ void static_scaled_int8_azp_quant_kernel( ++ scalar_t const* __restrict__ input, int8_t* __restrict__ out, ++ scale_type const* scale_ptr, azp_type const* azp_ptr, ++ const int hidden_size) { ++ int const tid = threadIdx.x; ++ int64_t const token_idx = blockIdx.x; ++ scale_type const scale = *scale_ptr; ++ azp_type const azp = *azp_ptr; ++ ++ // Must be performed using 64-bit math to avoid integer overflow. ++ out += token_idx * hidden_size; ++ input += token_idx * hidden_size; ++ ++ for (int i = tid; i < hidden_size; i += blockDim.x) { ++ auto const val = static_cast(input[i]); ++ auto const quant_val = int32_to_int8(float_to_int32_rn(val / scale) + azp); ++ out[i] = quant_val; ++ } ++} ++ ++template ++__global__ void dynamic_scaled_int8_quant_kernel( ++ scalar_t const* __restrict__ input, int8_t* __restrict__ out, ++ scale_type* scale, const int hidden_size) { ++ int const tid = threadIdx.x; ++ int64_t const token_idx = blockIdx.x; ++ float absmax_val = 0.0f; ++ float const zero = 0.0f; ++ ++ // Must be performed using 64-bit math to avoid integer overflow. ++ out += token_idx * hidden_size; ++ input += token_idx * hidden_size; ++ ++ for (int i = tid; i < hidden_size; i += blockDim.x) { ++ float val = static_cast(input[i]); ++ val = val > zero ? val : -val; ++ absmax_val = val > absmax_val ? 
val : absmax_val; ++ } ++ ++ using BlockReduce = cub::BlockReduce; ++ __shared__ typename BlockReduce::TempStorage reduceStorage; ++ float const block_absmax_val_maybe = ++ BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x); ++ __shared__ float block_absmax_val; ++ if (tid == 0) { ++ block_absmax_val = block_absmax_val_maybe; ++ scale[token_idx] = block_absmax_val / 127.0f; ++ } ++ __syncthreads(); ++ ++ float const tmp_scale = 127.0f / block_absmax_val; ++ for (int i = tid; i < hidden_size; i += blockDim.x) { ++ out[i] = float_to_int8_rn(static_cast(input[i]) * tmp_scale); ++ } ++} ++ ++template ++__global__ void dynamic_scaled_int8_azp_quant_kernel( ++ scalar_t const* __restrict__ input, int8_t* __restrict__ out, ++ scale_type* scale, azp_type* azp, const int hidden_size) { ++ int64_t const token_idx = blockIdx.x; ++ ++ // Must be performed using 64-bit math to avoid integer overflow. ++ out += token_idx * hidden_size; ++ input += token_idx * hidden_size; ++ ++ // Scan for the min and max value for this token ++ float max_val = std::numeric_limits::min(); ++ float min_val = std::numeric_limits::max(); ++ for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) { ++ auto val = static_cast(input[i]); ++ max_val = std::max(max_val, val); ++ min_val = std::min(min_val, val); ++ } ++ ++ // Reduce the max and min values across the block ++ using BlockReduce = cub::BlockReduce; ++ __shared__ typename BlockReduce::TempStorage reduceStorage; ++ max_val = BlockReduce(reduceStorage).Reduce(max_val, cub::Max{}, blockDim.x); ++ __syncthreads(); // Make sure min doesn't mess with max shared memory ++ min_val = BlockReduce(reduceStorage).Reduce(min_val, cub::Min{}, blockDim.x); ++ ++ __shared__ scale_type scale_sh; ++ __shared__ azp_type azp_sh; ++ ++ // Compute the scale and zero point and store them, only on the first thread ++ if (threadIdx.x == 0) { ++ float const scale_val = (max_val - min_val) / 255.0f; ++ // Use rounding to even (same as torch.round) ++ auto const azp_float = std::nearbyint(-128.0f - min_val / scale_val); ++ auto const azp_val = static_cast(azp_float); ++ ++ // Store the scale and azp into shared and global ++ scale[token_idx] = scale_sh = scale_val; ++ azp[token_idx] = azp_sh = azp_val; ++ } ++ ++ // Wait for the scale and azp to be computed ++ __syncthreads(); ++ ++ float const scale_val = scale_sh; ++ azp_type const azp_val = azp_sh; ++ ++ // Quantize the values ++ for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) { ++ auto const val = static_cast(input[i]); ++ auto const quant_val = ++ int32_to_int8(float_to_int32_rn(val / scale_val) + azp_val); ++ out[i] = quant_val; ++ } ++} ++ ++} // namespace vllm ++ ++void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] ++ torch::Tensor const& input, // [..., hidden_size] ++ torch::Tensor const& scale, ++ std::optional const& azp) { ++ TORCH_CHECK(input.is_contiguous()); ++ TORCH_CHECK(out.is_contiguous()); ++ TORCH_CHECK(scale.numel() == 1); ++ TORCH_CHECK(!azp || azp->numel() == 1); ++ ++ int const hidden_size = input.size(-1); ++ int const num_tokens = input.numel() / hidden_size; ++ dim3 const grid(num_tokens); ++ dim3 const block(std::min(hidden_size, 1024)); ++ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); ++ VLLM_DISPATCH_FLOATING_TYPES( ++ input.scalar_type(), "static_scaled_int8_quant_kernel", [&] { ++ if (!azp) { ++ vllm::static_scaled_int8_quant_kernel ++ <<>>( ++ input.data_ptr(), out.data_ptr(), ++ scale.data_ptr(), hidden_size); ++ } else { ++ 
vllm::static_scaled_int8_azp_quant_kernel ++ <<>>( ++ input.data_ptr(), out.data_ptr(), ++ scale.data_ptr(), azp->data_ptr(), ++ hidden_size); ++ } ++ }); ++} ++ ++void dynamic_scaled_int8_quant( ++ torch::Tensor& out, // [..., hidden_size] ++ torch::Tensor const& input, // [..., hidden_size] ++ torch::Tensor& scales, std::optional const& azp) { ++ TORCH_CHECK(input.is_contiguous()); ++ TORCH_CHECK(out.is_contiguous()); ++ TORCH_CHECK(scales.is_contiguous()); ++ TORCH_CHECK(!azp || azp->is_contiguous()); ++ ++ int const hidden_size = input.size(-1); ++ int const num_tokens = input.numel() / hidden_size; ++ dim3 const grid(num_tokens); ++ dim3 const block(std::min(hidden_size, 1024)); ++ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); ++ VLLM_DISPATCH_FLOATING_TYPES( ++ input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] { ++ if (!azp) { ++ vllm::dynamic_scaled_int8_quant_kernel ++ <<>>( ++ input.data_ptr(), out.data_ptr(), ++ scales.data_ptr(), hidden_size); ++ } else { ++ vllm::dynamic_scaled_int8_azp_quant_kernel ++ <<>>( ++ input.data_ptr(), out.data_ptr(), ++ scales.data_ptr(), azp->data_ptr(), ++ hidden_size); ++ } ++ }); ++} +diff --git a/csrc/quantization/cutlass_w8a8/Epilogues.md b/csrc/quantization/cutlass_w8a8/Epilogues.md +new file mode 100644 +index 0000000..aae0415 +--- /dev/null ++++ b/csrc/quantization/cutlass_w8a8/Epilogues.md +@@ -0,0 +1,147 @@ ++# CUTLASS Epilogues ++ ++## Introduction ++This document describes the various CUTLASS epilogues implemented for fusing de-quantization operations onto GEMMs. ++ ++Currently, we only support symmetric quantization for weights, ++and symmetric and asymmetric quantization for activations. ++Both can be quantized per-tensor or per-channel (weights) / per-token (activations). ++ ++There are 4 epilogues: ++1. ScaledEpilogue: symmetric quantization for activations, no bias. ++1. ScaledEpilogueBias: symmetric quantization for activations, supports bias. ++1. ScaledEpilogueAzp: asymmetric per-tensor quantization for activations, supports bias. ++1. ScaledEpilogueAzpPerToken: asymmetric per-token quantization for activations, supports bias. ++ ++We do not have epilogues for asymmetric quantization of activations without bias in order to reduce final binary size. ++Instead, if no bias is passed, the epilogue will use 0 as the bias. ++That induces a redundant addition operation (and runtime check), but the performance impact is minor. ++ ++## Underlying Linear Algebra ++ ++More details available in the [Activation Quantization RFC](https://github.com/vllm-project/vllm/issues/3975). ++ ++If $` \widehat X `$ is the quantized $` X `$, our matrices become the following ++ ++```math ++A = s_a (\widehat A - J_a z_a) ++``` ++```math ++B = s_b \widehat B ++``` ++```math ++D = A B + C ++``` ++```math ++D = s_a s_b \widehat D + C ++``` ++ ++Here, D is the output of the GEMM, and C is the bias. ++A is the activations and supports asymmetric quantization, ++and B is the weights and only supports symmetric quantization. ++$ s_a $ and $s_b$ are the scales for activations and weights, respectively. ++$ z_a $ is the zero-point for activations, and $ J_a $ is the matrix of all ones with dimensions of A. ++Additional epilogues would be required to support asymmetric quantization for weights. 
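As a quick numeric illustration of the definitions above (values chosen arbitrarily, with no bias so $` C = 0 `$): take $` s_a = 0.5 `$, $` z_a = 2 `$, $` s_b = 0.25 `$, $` \widehat A = \begin{bmatrix} 4 & 6 \end{bmatrix} `$ and $` \widehat B = \begin{bmatrix} 2 \\ 4 \end{bmatrix} `$.

```math
A = 0.5 \left( \begin{bmatrix} 4 & 6 \end{bmatrix} - 2 \begin{bmatrix} 1 & 1 \end{bmatrix} \right) = \begin{bmatrix} 1 & 2 \end{bmatrix},
\quad
B = 0.25 \begin{bmatrix} 2 \\ 4 \end{bmatrix} = \begin{bmatrix} 0.5 \\ 1 \end{bmatrix},
\quad
D = A B = 2.5
```

The integer GEMM instead yields $` \widehat A \widehat B = 32 `$, and $` s_a s_b \cdot 32 = 4 \neq 2.5 `$: the epilogue must also subtract a zero-point term built from the column sums of $` \widehat B `$ (here $` z_a (2 + 4) = 12 `$, and $` 0.125 \, (32 - 12) = 2.5 `$). The expansion below makes that term explicit.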
++ ++Expanding further, we can calculate $` \widehat D `$ as follows: ++ ++```math ++A B = s_a ( \widehat A - J_a z_a ) s_b \widehat B ++``` ++```math ++A B = s_a s_b \left( \widehat A \widehat B - J_a z_a \widehat B \right) ++``` ++```math ++\widehat D = \widehat A \widehat B - z_a J_a \widehat B ++``` ++ ++Note that $` \widehat A \widehat B `$ is the raw output of the GEMM, ++and $` J_a \widehat B `$ is known ahead of time. ++Each row of it is equal to $` \mathbf 1 \widehat B `$, which is a row-vector of column sums of $` \widehat B `$. ++ ++## Epilogues ++ ++### ScaledEpilogue ++This epilogue computes the symmetric quantization for activations without bias, meaning $` C = 0 `$ and $` z_a = 0 `$. ++The output of the GEMM is: ++ ++```math ++\widehat D = \widehat A \widehat B ++``` ++```math ++D = s_a s_b \widehat D ++``` ++```math ++D = s_a s_b \widehat A \widehat B ++``` ++ ++Epilogue parameters: ++- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). ++- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). ++ ++### ScaledEpilogueBias ++This epilogue computes the symmetric quantization for activations with bias, meaning $` z_a = 0 `$. ++The output of the GEMM is: ++ ++```math ++\widehat D = \widehat A \widehat B ++``` ++```math ++D = s_a s_b \widehat D + C ++``` ++```math ++D = s_a s_b \widehat A \widehat B + C ++``` ++ ++ ++Epilogue parameters: ++- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). ++- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). ++- `bias` is the bias, is always per-channel (row-vector). ++ ++### ScaledEpilogueAzp ++This epilogue computes the asymmetric per-tensor quantization for activations with bias. ++The output of the GEMM is: ++ ++```math ++\widehat D = \widehat A \widehat B - z_a J_a \widehat B ++``` ++```math ++D = s_a s_b \widehat D + C ++``` ++```math ++D = s_a s_b \left( \widehat A \widehat B - z_a J_a \widehat B \right) + C ++``` ++ ++Because $` z_a `$ is a scalar, the zero-point term $` z_a J_a \widehat B `$ has every row equal to $` z_a \mathbf 1 B `$. ++That is precomputed and stored in `azp_with_adj` as a row-vector. ++ ++Epilogue parameters: ++- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). ++ - Generally this will be per-tensor as the zero-points are per-tensor. ++- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). ++- `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$), is per-channel (row-vector). ++- `bias` is the bias, is always per-channel (row-vector). ++ ++To use these kernels efficiently, users must precompute the `azp_with_adj` term offline and pass it to the kernel. ++ ++### ScaledEpilogueAzpPerToken ++This epilogue computes the asymmetric per-token quantization for activations with bias. ++ ++The output of the GEMM is the same as above, but the $` z_a `$ is a column-vector. ++That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product of $` z_a `$ and $` \mathbf 1 \widehat B `$. ++ ++Epilogue parameters: ++- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). ++ - Generally this will be per-token as the zero-points are per-token. ++- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). 
++- `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$), is per-channel (row-vector). ++- `azp` is the zero-point (`z_a`), is per-token (column-vector). ++- `bias` is the bias, is always per-channel (row-vector). ++ ++To use these kernels efficiently, users must precompute the `azp_adj` term offline and pass it to the kernel. ++ ++The epilogue performs the following computation (where `Dq` is the raw quantized output of the GEMM): ++``` ++out = scale_a * scale_b * (Dq - azp_adj * azp) + bias ++``` +diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +new file mode 100644 +index 0000000..865fef5 +--- /dev/null ++++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +@@ -0,0 +1,199 @@ ++#include ++#include ++#include "cutlass/cutlass.h" ++ ++#include "scaled_mm_c2x.cuh" ++#include "scaled_mm_c2x_sm75_dispatch.cuh" ++#include "scaled_mm_c2x_sm80_dispatch.cuh" ++#include "scaled_mm_c2x_sm89_fp8_dispatch.cuh" ++#include "scaled_mm_c2x_sm89_int8_dispatch.cuh" ++ ++#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp" ++ ++using namespace vllm; ++ ++/* ++ This file defines quantized GEMM operations using the CUTLASS 2.x API, for ++ NVIDIA GPUs with SM versions prior to sm90 (Hopper). ++*/ ++ ++template